Commit 1f89114

Fix: Mark GLB chunk reading functions as inline, and some other minor refactors
spnda committed May 12, 2024
1 parent c2f4f0e commit 1f89114
Showing 4 changed files with 25 additions and 18 deletions.
examples/gl_viewer/gl_viewer.cpp (1 change: 0 additions & 1 deletion)

@@ -40,7 +40,6 @@
 #include <glm/gtc/matrix_transform.hpp>
 #include <glm/gtc/matrix_inverse.hpp>
 #include <glm/gtc/type_ptr.hpp>
-#include <glm/gtx/quaternion.hpp>
 
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
include/fastgltf/util.hpp (12 changes: 10 additions & 2 deletions)
@@ -45,6 +45,7 @@
 
 #if (!defined(_MSVC_LANG) && __cplusplus >= 202002L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
 #define FASTGLTF_CPP_20 1
+#include <version>
 #else
 #define FASTGLTF_CPP_20 0
 #endif
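The new include exists because <version> defines the standard library's feature-test macros without pulling in full headers; the byteswap change further down keys off one of them. A small detection sketch (illustrative, not part of the commit):

```cpp
#include <cstdint>
#include <version>

// <version> makes library feature-test macros such as __cpp_lib_byteswap
// visible, so C++20 builds can detect std::byteswap before including <bit>.
#if defined(__cpp_lib_byteswap) && __cpp_lib_byteswap >= 202110L
    #include <bit> // std::byteswap lives here
    static_assert(std::byteswap(std::uint16_t{0x1122}) == std::uint16_t{0x2211});
#endif
```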
@@ -238,7 +239,7 @@ namespace fastgltf {
 [[gnu::hot, gnu::const]] std::uint32_t sse_crc32c(std::string_view str) noexcept;
 [[gnu::hot, gnu::const]] std::uint32_t sse_crc32c(const std::uint8_t* d, std::size_t len) noexcept;
 #elif defined(FASTGLTF_IS_A64) && !defined(_MSC_VER) && !defined(__ANDROID__)
-// Both MSVC stdlib and Android NDK don't include the arm intrinsics
+// Both MSVC stdlib and Android NDK don't include the arm intrinsics. TODO: Find a workaround?
 #define FASTGLTF_ENABLE_ARMV8_CRC 1
 [[gnu::hot, gnu::const]] std::uint32_t armv8_crc32c(std::string_view str) noexcept;
 [[gnu::hot, gnu::const]] std::uint32_t armv8_crc32c(const std::uint8_t* d, std::size_t len) noexcept;
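For orientation, these declarations select a hardware-accelerated CRC-32C (Castagnoli) implementation per target. A bit-at-a-time scalar version of the same checksum, shown as a reference sketch rather than fastgltf's actual fallback:

```cpp
#include <cstddef>
#include <cstdint>

// Bitwise CRC-32C using the reflected Castagnoli polynomial 0x82F63B78.
// This computes the same value the SSE4.2/ARMv8 CRC instructions do, only slowly.
std::uint32_t crc32c_bitwise(const std::uint8_t* d, std::size_t len) noexcept {
    std::uint32_t crc = 0xFFFFFFFFu;
    for (std::size_t i = 0; i < len; ++i) {
        crc ^= d[i];
        for (int bit = 0; bit < 8; ++bit)
            crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
    }
    return crc ^ 0xFFFFFFFFu;
}
```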
@@ -365,16 +366,23 @@ namespace fastgltf {
 }
 #endif
 
+#if FASTGLTF_CPP_20 && defined(__cpp_lib_byteswap) && __cpp_lib_byteswap >= 202110L
+template<class T>
+constexpr T byteswap(T n) noexcept {
+return std::byteswap(n);
+}
+#else
 template<typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
 #if FASTGLTF_CONSTEXPR_BITCAST
 constexpr
 #endif
-T byteswap(T value) noexcept {
+auto byteswap(T value) noexcept {
 static_assert(std::has_unique_object_representations_v<T>, "T may not have padding bits");
 auto bytes = bit_cast<std::array<std::byte, sizeof(T)>>(value);
 bytes = decltype(bytes)(bytes.rbegin(), bytes.rend());
 return bit_cast<T>(bytes);
 }
+#endif
 
 /**
 * Returns the absolute value of the given integer in its unsigned type.
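The pre-C++20 path reverses the value's object representation through bit_cast to a byte array. A self-contained sketch of the same technique using std::memcpy in place of bit_cast (hypothetical helper name, not fastgltf's code):

```cpp
#include <algorithm>
#include <array>
#include <cstring>
#include <type_traits>

// Reverse an integer's object representation: copy it into a byte array,
// reverse the array, copy it back. std::memcpy stands in for bit_cast here.
template<typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
T byteswap_fallback(T value) noexcept {
    std::array<unsigned char, sizeof(T)> bytes;
    std::memcpy(bytes.data(), &value, sizeof(T));
    std::reverse(bytes.begin(), bytes.end());
    T out;
    std::memcpy(&out, bytes.data(), sizeof(T));
    return out;
}

// e.g. byteswap_fallback<std::uint32_t>(0x11223344u) == 0x44332211u
```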
src/base64.cpp (17 changes: 9 additions & 8 deletions)
@@ -63,6 +63,8 @@ namespace fg = fastgltf;
 
 #if defined(_MSC_VER)
 #define FORCEINLINE __forceinline
+#elif defined(__GNUC__) || defined(__clang__)
+#define FORCEINLINE [[gnu::always_inline]]
 #else
 // On other compilers we need the inline specifier, so that the functions in this compilation unit
 // can be properly inlined without the "function body can be overwritten at link time" error.
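The new #elif branch gives GCC and Clang a forced-inline spelling instead of the generic fallback. A condensed sketch of this kind of per-compiler dispatch (illustrative, not the full file):

```cpp
// Per-compiler force-inline dispatch. The `inline` keyword is added in this
// sketch for ODR safety; the diff defines the macro as the bare attribute
// because it only applies it to functions within a single translation unit.
#if defined(_MSC_VER)
    #define FORCEINLINE __forceinline
#elif defined(__GNUC__) || defined(__clang__)
    #define FORCEINLINE [[gnu::always_inline]] inline
#else
    #define FORCEINLINE inline
#endif

FORCEINLINE int twice(int x) { return x * 2; } // expanded at each call site
```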
@@ -118,8 +120,7 @@ namespace fastgltf::base64 {
 // The AVX and SSE decoding functions are based on http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html.
 // It covers various methods of en-/decoding base64 using SSE and AVX and also shows their
 // performance metrics.
-// TODO: Mark these functions with msvc::forceinline which is available from C++20
-[[gnu::target("avx2"), gnu::always_inline]] FORCEINLINE auto avx2_lookup_pshufb_bitmask(const __m256i input) {
+[[gnu::target("avx2")]] FORCEINLINE auto avx2_lookup_pshufb_bitmask(const __m256i input) {
 const auto higher_nibble = _mm256_and_si256(_mm256_srli_epi32(input, 4), _mm256_set1_epi8(0x0f));
 
 const auto shiftLUT = _mm256_setr_epi8(
@@ -136,7 +137,7 @@ namespace fastgltf::base64 {
 return _mm256_add_epi8(input, shift);
 }
 
-[[gnu::target("avx2"), gnu::always_inline]] FORCEINLINE auto avx2_pack_ints(__m256i input) {
+[[gnu::target("avx2")]] FORCEINLINE auto avx2_pack_ints(__m256i input) {
 const auto merge = _mm256_maddubs_epi16(input, _mm256_set1_epi32(0x01400140));
 return _mm256_madd_epi16(merge, _mm256_set1_epi32(0x00011000));
 }
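The two multiply-add steps merge four 6-bit values per 32-bit lane into one 24-bit integer. The same arithmetic in scalar form (a sketch):

```cpp
#include <cstdint>

// Per 32-bit lane, the SIMD pack turns four sextets a,b,c,d into the 24-bit
// integer (a << 18) | (b << 12) | (c << 6) | d via two multiply-adds.
std::uint32_t pack_sextets(std::uint8_t a, std::uint8_t b, std::uint8_t c, std::uint8_t d) {
    std::uint32_t hi = a * 0x40u + b; // maddubs with the 0x01400140 constant
    std::uint32_t lo = c * 0x40u + d;
    return hi * 0x1000u + lo;         // madd with the 0x00011000 constant
}
```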
@@ -203,7 +204,7 @@ namespace fastgltf::base64 {
 return ret;
 }
 
-[[gnu::target("sse4.1"), gnu::always_inline]] FORCEINLINE auto sse4_lookup_pshufb_bitmask(const __m128i input) {
+[[gnu::target("sse4.1")]] FORCEINLINE auto sse4_lookup_pshufb_bitmask(const __m128i input) {
 const auto higher_nibble = _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0f));
 
 const auto shiftLUT = _mm_setr_epi8(
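The pshufb-based lookup classifies each byte by its high nibble and adds an offset from shiftLUT to map ASCII to a 6-bit value. A scalar rendering of the idea (a sketch; the vector code also validates input and, in lines not shown here, handles the '/' character, whose nibble collides with '+'):

```cpp
#include <array>
#include <cstdint>

// Scalar sketch of the shift-LUT translation used by the SSE/AVX/NEON paths:
// '0'-'9' (nibble 3) get +4, 'A'-'Z' (4,5) get -65, 'a'-'z' (6,7) get -71,
// '+' (nibble 2) gets +19.
std::uint8_t decode_sextet(std::uint8_t ascii) {
    static constexpr std::array<std::int8_t, 16> shiftLUT = {
        0, 0, 19, 4, -65, -65, -71, -71,
        0, 0, 0, 0, 0, 0, 0, 0 };
    if (ascii == 0x2F)
        return 63; // '/' is special-cased separately in the vectorised versions
    return static_cast<std::uint8_t>(ascii + shiftLUT[ascii >> 4]);
}
```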
@@ -217,7 +218,7 @@ namespace fastgltf::base64 {
 return _mm_add_epi8(input, shift);
 }
 
-[[gnu::target("sse4.1"), gnu::always_inline]] FORCEINLINE auto sse4_pack_ints(__m128i input) {
+[[gnu::target("sse4.1")]] FORCEINLINE auto sse4_pack_ints(__m128i input) {
 const auto merge = _mm_maddubs_epi16(input, _mm_set1_epi32(0x01400140));
 return _mm_madd_epi16(merge, _mm_set1_epi32(0x00011000));
 }
@@ -278,7 +279,7 @@ namespace fastgltf::base64 {
 return ret;
 }
 #elif defined(FASTGLTF_IS_A64)
-[[gnu::always_inline]] FORCEINLINE int8x16_t neon_lookup_pshufb_bitmask(const uint8x16_t input) {
+FORCEINLINE int8x16_t neon_lookup_pshufb_bitmask(const uint8x16_t input) {
 // clang-format off
 constexpr std::array<int8_t, 16> shiftLUTdata = {
 0, 0, 19, 4, -65, -65, -71, -71,
@@ -297,7 +298,7 @@ namespace fastgltf::base64 {
 return vaddq_s8(input, shift);
 }
 
-[[gnu::always_inline]] FORCEINLINE int16x8_t neon_pack_ints(const int8x16_t input) {
+FORCEINLINE int16x8_t neon_pack_ints(const int8x16_t input) {
 const uint32x4_t mask = vdupq_n_u32(0x01400140);
 
 const int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(input))), vmovl_s8(vget_low_s8(mask)));
@@ -386,7 +387,7 @@ static constexpr std::array<std::uint8_t, 128> base64lut = {
 
 namespace fastgltf::base64 {
 template <typename Output>
-[[gnu::always_inline]] FORCEINLINE void decode_block(std::array<std::uint8_t, 4>& sixBitChars, Output output) {
+FORCEINLINE void decode_block(std::array<std::uint8_t, 4>& sixBitChars, Output output) {
 for (std::size_t i = 0; i < 4; i++) {
 assert(static_cast<std::size_t>(sixBitChars[i]) < base64lut.size());
 sixBitChars[i] = base64lut[sixBitChars[i]];
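After the LUT pass, each element holds a 6-bit value and a 4-character block decodes to 3 output bytes. A scalar sketch of that step (hypothetical helper, not the template from the diff):

```cpp
#include <array>
#include <cstdint>

// Pack four sextets into 24 bits and emit them as three bytes.
void decode_block_scalar(const std::array<std::uint8_t, 4>& sixBitChars, std::uint8_t* out) {
    const std::uint32_t bits =
          (std::uint32_t(sixBitChars[0]) << 18) | (std::uint32_t(sixBitChars[1]) << 12)
        | (std::uint32_t(sixBitChars[2]) << 6)  |  std::uint32_t(sixBitChars[3]);
    out[0] = static_cast<std::uint8_t>(bits >> 16);
    out[1] = static_cast<std::uint8_t>(bits >> 8);
    out[2] = static_cast<std::uint8_t>(bits);
}
```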
src/fastgltf.cpp (13 changes: 6 additions & 7 deletions)
@@ -73,22 +73,22 @@ namespace fastgltf {
 static_assert(sizeof(BinaryGltfHeader) == 12, "Binary gltf header must be 12 bytes");
 static_assert(std::is_trivially_copyable_v<BinaryGltfHeader>);
 
-void readUint32LE(std::uint32_t& x, std::byte* bytes) noexcept {
+constexpr void readUint32LE(std::uint32_t& x, std::byte* bytes) noexcept {
 x = std::uint32_t(bytes[0])
 | (std::uint32_t(bytes[1]) << 8)
 | (std::uint32_t(bytes[2]) << 16)
 | (std::uint32_t(bytes[3]) << 24);
 }
 
-void writeUint32LE(std::uint32_t x, std::byte* buffer) noexcept {
+constexpr void writeUint32LE(std::uint32_t x, std::byte* buffer) noexcept {
 buffer[0] = static_cast<std::byte>(x);
 buffer[1] = static_cast<std::byte>(x >> 8);
 buffer[2] = static_cast<std::byte>(x >> 16);
 buffer[3] = static_cast<std::byte>(x >> 24);
 }
 
 /** GLBs are always little-endian, meaning we need to read the values accordingly */
-[[nodiscard, gnu::always_inline]] auto readBinaryHeader(GltfDataGetter& getter) noexcept {
+[[nodiscard, gnu::always_inline]] inline auto readBinaryHeader(GltfDataGetter& getter) noexcept {
 std::array<std::byte, sizeof(BinaryGltfHeader)> bytes {};
 getter.read(bytes.data(), bytes.size());
 
@@ -99,7 +99,7 @@ namespace fastgltf {
 return header;
 }
 
-[[gnu::always_inline]] auto writeBinaryHeader(const BinaryGltfHeader& header) noexcept {
+[[gnu::always_inline]] inline auto writeBinaryHeader(const BinaryGltfHeader& header) noexcept {
 std::array<std::byte, sizeof(BinaryGltfHeader)> bytes {};
 writeUint32LE(header.magic, &bytes[offsetof(BinaryGltfHeader, magic)]);
 writeUint32LE(header.version, &bytes[offsetof(BinaryGltfHeader, version)]);
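These helpers serialize the 12-byte GLB header, three little-endian uint32 fields per the glTF 2.0 spec. A standalone sketch of reading one back (hypothetical helper names, not fastgltf API):

```cpp
#include <cstdint>

struct GlbHeader { std::uint32_t magic, version, length; };

// Assemble the header's little-endian fields from raw bytes.
GlbHeader parseGlbHeader(const unsigned char* bytes) {
    auto u32le = [](const unsigned char* p) {
        return std::uint32_t(p[0]) | (std::uint32_t(p[1]) << 8)
             | (std::uint32_t(p[2]) << 16) | (std::uint32_t(p[3]) << 24);
    };
    // A valid GLB starts with magic 0x46546C67, the ASCII bytes "glTF".
    return GlbHeader { u32le(bytes), u32le(bytes + 4), u32le(bytes + 8) };
}
```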
@@ -113,7 +113,7 @@ namespace fastgltf {
 };
 static_assert(std::is_trivially_copyable_v<BinaryGltfChunk>);
 
-[[nodiscard, gnu::always_inline]] auto readBinaryChunk(GltfDataGetter& getter) noexcept {
+[[nodiscard, gnu::always_inline]] inline auto readBinaryChunk(GltfDataGetter& getter) noexcept {
 std::array<std::byte, sizeof(BinaryGltfChunk)> bytes {};
 getter.read(bytes.data(), bytes.size());
 
@@ -123,7 +123,7 @@ namespace fastgltf {
 return chunk;
 }
 
-[[gnu::always_inline]] auto writeBinaryChunk(const BinaryGltfChunk& chunk) noexcept {
+[[gnu::always_inline]] inline auto writeBinaryChunk(const BinaryGltfChunk& chunk) noexcept {
 std::array<std::byte, sizeof(BinaryGltfChunk)> bytes {};
 writeUint32LE(chunk.chunkLength, &bytes[offsetof(BinaryGltfChunk, chunkLength)]);
 writeUint32LE(chunk.chunkType, &bytes[offsetof(BinaryGltfChunk, chunkType)]);
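A GLB chunk header is likewise two little-endian uint32s, chunkLength followed by chunkType. A sketch of emitting a JSON chunk header (hypothetical helper, not fastgltf API):

```cpp
#include <array>
#include <cstddef>
#include <cstdint>

// Build the 8-byte chunk header for a JSON chunk.
std::array<std::byte, 8> makeJsonChunkHeader(std::uint32_t jsonLength) {
    std::array<std::byte, 8> bytes {};
    auto put32le = [&](std::size_t off, std::uint32_t x) {
        for (std::size_t i = 0; i < 4; ++i)
            bytes[off + i] = static_cast<std::byte>(x >> (8 * i));
    };
    put32le(0, jsonLength);  // chunkLength
    put32le(4, 0x4E4F534Au); // chunkType: "JSON" read as a little-endian uint32
    return bytes;
}
```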
@@ -173,7 +173,6 @@ namespace fastgltf {
 
 // Decrementing the length variable and incrementing the pointer directly has better codegen with Clang
 // than using a std::size_t i = 0.
-// TODO: is there perhaps just some intrinsic we can use instead of inline asm?
 auto length = static_cast<std::int64_t>(len);
 while ((length -= sizeof(std::uint64_t)) >= 0) {
 std::uint64_t value;
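The loop shape the comment describes, in isolation (a sketch with a placeholder body, not the CRC routine from the diff): subtract 8 from a signed length per iteration, then recover the tail count from the over-subtraction.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

std::uint64_t sum_words(const std::uint8_t* d, std::size_t len) {
    std::uint64_t acc = 0;
    auto length = static_cast<std::int64_t>(len);
    while ((length -= sizeof(std::uint64_t)) >= 0) {
        std::uint64_t value;
        std::memcpy(&value, d, sizeof value); // unaligned-safe 8-byte load
        acc += value;
        d += sizeof(std::uint64_t);
    }
    // Re-add the final subtraction to get the number of leftover bytes.
    for (length += sizeof(std::uint64_t); length > 0; --length)
        acc += *d++;
    return acc;
}
```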
