Merge branch 'wheremyfoodat:master' into master

Ishan09811 · Mar 15, 2024 · 941c7f3 · 941c7f3
2 parents 126169e + a00a5e0
commit 941c7f3
Show file tree

Hide file tree

Showing 11 changed files with 526 additions and 194 deletions.
diff --git a/.gitignore b/.gitignore
@@ -19,7 +19,7 @@ build/
 
 .vs/
 .vscode/*.log
-
+.cache/
 ipch/
 *.aps
 *.ncb

diff --git a/.gitmodules b/.gitmodules
@@ -61,3 +61,9 @@
 [submodule "third_party/dynarmic"]
 	path = third_party/dynarmic
 	url = https://github.com/Panda3DS-emu/dynarmic
+[submodule "third_party/nihstro"]
+	path = third_party/nihstro
+	url = https://github.com/neobrain/nihstro.git
+[submodule "third_party/Catch2"]
+	path = third_party/Catch2
+	url = https://github.com/catchorg/Catch2.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
diff --git a/include/PICA/dynapica/shader_rec_emitter_arm64.hpp b/include/PICA/dynapica/shader_rec_emitter_arm64.hpp
@@ -42,6 +42,9 @@ class ShaderEmitter : private oaknut::CodeBlock, public oaknut::CodeGenerator {
 	oaknut::Label emitLog2Func();
 	oaknut::Label emitExp2Func();
 
+	// Emit a PICA200-compliant multiplication that handles "0 * inf = 0"
+	void emitSafeMUL(oaknut::QReg src1, oaknut::QReg src2, oaknut::QReg scratch0);
+
 	template <typename T>
 	T getLabelPointer(const oaknut::Label& label) {
 		auto pointer = reinterpret_cast<u8*>(oaknut::CodeBlock::ptr()) + label.offset();
@@ -123,9 +126,7 @@ class ShaderEmitter : private oaknut::CodeBlock, public oaknut::CodeGenerator {
 	ShaderEmitter() : oaknut::CodeBlock(allocSize), oaknut::CodeGenerator(oaknut::CodeBlock::ptr()) {}
 
 	// PC must be a valid entrypoint here. It doesn't have that much overhead in this case, so we use std::array<>::at() to assert it does
-	InstructionCallback getInstructionCallback(u32 pc) {
-		return getLabelPointer<InstructionCallback>(instructionLabels.at(pc));
-	}
+	InstructionCallback getInstructionCallback(u32 pc) { return getLabelPointer<InstructionCallback>(instructionLabels.at(pc)); }
 
 	PrologueCallback getPrologueCallback() { return prologueCb; }
 	void compile(const PICAShader& shaderUnit);

diff --git a/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp
@@ -7,6 +7,9 @@ using namespace Helpers;
 using namespace oaknut;
 using namespace oaknut::util;
 
+// TODO: Expose safe/unsafe optimizations to the user
+constexpr bool useSafeMUL = true;
+
 // Similar to the x64 recompiler, we use an odd internal ABI, which abuses the fact that we'll very rarely be calling C++ functions
 // So to avoid pushing and popping, we'll be making use of volatile registers as much as possible
 static constexpr QReg scratch1 = Q0;
@@ -474,14 +477,18 @@ void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) {
 	const u32 dest = getBits<21, 5>(instruction);
 	const u32 writeMask = getBits<0, 4>(operandDescriptor);
 
-	// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
 	loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
 	loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
 	// Set W component of src1 to 0.0, so that the w factor of the following dp4 will become 0, making it equivalent to a dp3
 	INS(src1_vec.Selem()[3], WZR);
 
 	// Now do a full DP4
-	FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());   // Do a piecewise multiplication of the vectors first
+	// Do a piecewise multiplication of the vectors first
+	if constexpr (useSafeMUL) {
+		emitSafeMUL(src1_vec, src2_vec, scratch1);
+	} else {
+		FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
+	}
 	FADDP(src1_vec.S4(), src1_vec.S4(), src1_vec.S4());  // Now add the adjacent components together
 	FADDP(src1_vec.toS(), src1_vec.toD().S2());          // Again for the bottom 2 lanes. Now the bottom lane contains the dot product
 
@@ -500,11 +507,15 @@ void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
 	const u32 dest = getBits<21, 5>(instruction);
 	const u32 writeMask = getBits<0, 4>(operandDescriptor);
 
-	// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
 	loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
 	loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
 
-	FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());   // Do a piecewise multiplication of the vectors first
+	// Do a piecewise multiplication of the vectors first
+	if constexpr (useSafeMUL) {
+		emitSafeMUL(src1_vec, src2_vec, scratch1);
+	} else {
+		FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
+	}
 	FADDP(src1_vec.S4(), src1_vec.S4(), src1_vec.S4());  // Now add the adjacent components together
 	FADDP(src1_vec.toS(), src1_vec.toD().S2());          // Again for the bottom 2 lanes. Now the bottom lane contains the dot product
 
@@ -515,6 +526,20 @@ void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
 	storeRegister(src1_vec, shader, dest, operandDescriptor);
 }
 
+void ShaderEmitter::emitSafeMUL(oaknut::QReg src1, oaknut::QReg src2, oaknut::QReg scratch0) {
+	// Emits src1 = src1 * src2 (4 x f32 lanes), except that 0 * inf and inf * 0 return 0 instead of NaN, as on the PICA
+	// scratch0 is clobbered and must not alias src1 or src2 (it is written before they are read by the FMUL); src2 is left untouched
+
+	// FMULX returns +-2.0 in the case of 0.0 * inf or inf * 0.0, whereas FMUL returns NaN
+	// Both a FMUL and FMULX are done and the results are compared to each other bitwise
+	// In the case that the results are different (a 0.0 * inf happened), the CMEQ mask for
+	// that lane is all zeroes, so the AND writes a 0.0; otherwise the FMUL result is kept as-is
+	FMULX(scratch0.S4(), src1.S4(), src2.S4());
+	FMUL(src1.S4(), src1.S4(), src2.S4());
+	CMEQ(scratch0.S4(), scratch0.S4(), src1.S4());
+	AND(src1.B16(), src1.B16(), scratch0.B16());
+}
+
 void ShaderEmitter::recADD(const PICAShader& shader, u32 instruction) {
 	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
 	const u32 src1 = getBits<12, 7>(instruction);
@@ -561,10 +586,15 @@ void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) {
 	const u32 idx = getBits<19, 2>(instruction);
 	const u32 dest = getBits<21, 5>(instruction);
 
-	// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
 	loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
 	loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
-	FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
+
+	if constexpr (useSafeMUL) {
+		emitSafeMUL(src1_vec, src2_vec, scratch1);
+	} else {
+		FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
+	}
+
 	storeRegister(src1_vec, shader, dest, operandDescriptor);
 }
 
@@ -632,8 +662,12 @@ void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) {
 	loadRegister<2>(src2_vec, shader, src2, isMADI ? 0 : idx, operandDescriptor);
 	loadRegister<3>(src3_vec, shader, src3, isMADI ? idx : 0, operandDescriptor);
 
-	// TODO: Safe PICA multiplication
-	FMLA(src3_vec.S4(), src1_vec.S4(), src2_vec.S4());
+	if constexpr (useSafeMUL) {
+		emitSafeMUL(src1_vec, src2_vec, scratch1);
+		FADD(src3_vec.S4(), src3_vec.S4(), src1_vec.S4());
+	} else {
+		FMLA(src3_vec.S4(), src1_vec.S4(), src2_vec.S4());
+	}
 	storeRegister(src3_vec, shader, dest, operandDescriptor);
 }
 

diff --git a/src/core/PICA/shader_interpreter.cpp b/src/core/PICA/shader_interpreter.cpp
@@ -223,7 +223,7 @@ void PICAShader::flr(u32 instruction) {
 	u32 componentMask = operandDescriptor & 0xf;
 	for (int i = 0; i < 4; i++) {
 		if (componentMask & (1 << i)) {
-			destVector[3 - i] = f24::fromFloat32(std::floor(srcVector[3 - 1].toFloat32()));
+			destVector[3 - i] = f24::fromFloat32(std::floor(srcVector[3 - i].toFloat32()));
 		}
 	}
 }
@@ -244,8 +244,12 @@ void PICAShader::max(u32 instruction) {
 	u32 componentMask = operandDescriptor & 0xf;
 	for (int i = 0; i < 4; i++) {
 		if (componentMask & (1 << i)) {
-			const auto maximum = srcVec1[3 - i] > srcVec2[3 - i] ? srcVec1[3 - i] : srcVec2[3 - i];
-			destVector[3 - i] = maximum;
+			const float inputA = srcVec1[3 - i].toFloat32();
+			const float inputB = srcVec2[3 - i].toFloat32();
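+			// Intended: max(NaN, 2.f) -> NaN, max(2.f, NaN) -> 2
+			// NOTE(review): as written, inputA = NaN with inputB = 2.f gives 2.f, and inputB = NaN gives NaN; confirm operand order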
+			const auto& maximum = std::isinf(inputB) ? inputB : std::max(inputB, inputA);
+			destVector[3 - i] = f24::fromFloat32(maximum);
 		}
 	}
 }
@@ -266,8 +270,12 @@ void PICAShader::min(u32 instruction) {
 	u32 componentMask = operandDescriptor & 0xf;
 	for (int i = 0; i < 4; i++) {
 		if (componentMask & (1 << i)) {
-			const auto mininum = srcVec1[3 - i] < srcVec2[3 - i] ? srcVec1[3 - i] : srcVec2[3 - i];
-			destVector[3 - i] = mininum;
+			const float inputA = srcVec1[3 - i].toFloat32();
+			const float inputB = srcVec2[3 - i].toFloat32();
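+			// Intended: min(NaN, 2.f) -> NaN, min(2.f, NaN) -> 2
+			// NOTE(review): as written, inputA = NaN with inputB = 2.f gives 2.f, and inputB = NaN gives NaN; confirm operand order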
+			const auto& mininum = std::min(inputB, inputA);
+			destVector[3 - i] = f24::fromFloat32(mininum);
 		}
 	}
 }
@@ -382,7 +390,11 @@ void PICAShader::rcp(u32 instruction) {
 	vec4f srcVec1 = getSourceSwizzled<1>(src1, operandDescriptor);
 
 	vec4f& destVector = getDest(dest);
-	f24 res = f24::fromFloat32(1.0f) / srcVec1[0];
+	float input = srcVec1[0].toFloat32();
+	if (input == -0.0f) {
+		input = 0.0f;
+	}
+	const f24 res = f24::fromFloat32(1.0f / input);
 
 	u32 componentMask = operandDescriptor & 0xf;
 	for (int i = 0; i < 4; i++) {
@@ -402,7 +414,11 @@ void PICAShader::rsq(u32 instruction) {
 	vec4f srcVec1 = getSourceSwizzled<1>(src1, operandDescriptor);
 
 	vec4f& destVector = getDest(dest);
-	f24 res = f24::fromFloat32(1.0f / std::sqrt(srcVec1[0].toFloat32()));
+	float input = srcVec1[0].toFloat32();
+	if (input == -0.0f) {
+		input = 0.0f;
+	}
+	const f24 res = f24::fromFloat32(1.0f / std::sqrt(input));
 
 	u32 componentMask = operandDescriptor & 0xf;
 	for (int i = 0; i < 4; i++) {

diff --git a/src/core/services/boss.cpp b/src/core/services/boss.cpp
@@ -15,6 +15,8 @@ namespace BOSSCommands {
 		GetTaskIdList = 0x000E0000,
 		GetNsDataIdList = 0x00100102,
 		GetNsDataIdList1 = 0x00110102,
+		GetNsDataIdList2 = 0x00120102,
+		GetNsDataIdList3 = 0x00130102,
 		SendProperty = 0x00140082,
 		ReceiveProperty = 0x00160082,
 		GetTaskServiceStatus = 0x001B0042,
@@ -40,7 +42,9 @@ void BOSSService::handleSyncRequest(u32 messagePointer) {
 		case BOSSCommands::GetErrorCode: getErrorCode(messagePointer); break;
 		case BOSSCommands::GetNewArrivalFlag: getNewArrivalFlag(messagePointer); break;
 		case BOSSCommands::GetNsDataIdList:
-		case BOSSCommands::GetNsDataIdList1: 
+		case BOSSCommands::GetNsDataIdList1:
+		case BOSSCommands::GetNsDataIdList2:
+		case BOSSCommands::GetNsDataIdList3:
 			getNsDataIdList(messagePointer, command); break;
 		case BOSSCommands::GetOptoutFlag: getOptoutFlag(messagePointer); break;
 		case BOSSCommands::GetStorageEntryInfo: getStorageEntryInfo(messagePointer); break;
-Original file line number
+Diff line change
@@ Expand Up / @@ -19,7 +19,7 @@ build/ @@
     .vs/
     .vscode/*.log
+    .cache/
     ipch/
     *.aps
     *.ncb
@@ Expand Down @@