Commit 1f89114

Fix: Mark GLB chunk reading functions as inline, and some other minor refactors
spnda committed May 12, 2024
1 parent c2f4f0e commit 1f89114
Showing 4 changed files with 25 additions and 18 deletions.
examples/gl_viewer/gl_viewer.cpp (1 change: 0 additions & 1 deletion)

@@ -40,7 +40,6 @@
 #include <glm/gtc/matrix_transform.hpp>
 #include <glm/gtc/matrix_inverse.hpp>
 #include <glm/gtc/type_ptr.hpp>
-#include <glm/gtx/quaternion.hpp>
 
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
include/fastgltf/util.hpp (12 changes: 10 additions & 2 deletions)
@@ -45,6 +45,7 @@
 
 #if (!defined(_MSVC_LANG) && __cplusplus >= 202002L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
 #define FASTGLTF_CPP_20 1
+#include <version>
 #else
 #define FASTGLTF_CPP_20 0
 #endif
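The new include exists because <version> defines the standard library's feature-test macros without pulling in full headers; the byteswap change further down keys off one of them. A small detection sketch (illustrative, not part of the commit):

```cpp
#include <cstdint>
#include <version>

// <version> makes library feature-test macros such as __cpp_lib_byteswap
// visible, so C++20 builds can detect std::byteswap before including <bit>.
#if defined(__cpp_lib_byteswap) && __cpp_lib_byteswap >= 202110L
    #include <bit> // std::byteswap lives here
    static_assert(std::byteswap(std::uint16_t{0x1122}) == std::uint16_t{0x2211});
#endif
```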
@@ -238,7 +239,7 @@ namespace fastgltf {
 [[gnu::hot, gnu::const]] std::uint32_t sse_crc32c(std::string_view str) noexcept;
 [[gnu::hot, gnu::const]] std::uint32_t sse_crc32c(const std::uint8_t* d, std::size_t len) noexcept;
 #elif defined(FASTGLTF_IS_A64) && !defined(_MSC_VER) && !defined(__ANDROID__)
-// Both MSVC stdlib and Android NDK don't include the arm intrinsics
+// Both MSVC stdlib and Android NDK don't include the arm intrinsics. TODO: Find a workaround?
 #define FASTGLTF_ENABLE_ARMV8_CRC 1
 [[gnu::hot, gnu::const]] std::uint32_t armv8_crc32c(std::string_view str) noexcept;
 [[gnu::hot, gnu::const]] std::uint32_t armv8_crc32c(const std::uint8_t* d, std::size_t len) noexcept;
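For orientation, these declarations select a hardware-accelerated CRC-32C (Castagnoli) implementation per target. A bit-at-a-time scalar version of the same checksum, shown as a reference sketch rather than fastgltf's actual fallback:

```cpp
#include <cstddef>
#include <cstdint>

// Bitwise CRC-32C using the reflected Castagnoli polynomial 0x82F63B78.
// This computes the same value the SSE4.2/ARMv8 CRC instructions do, only slowly.
std::uint32_t crc32c_bitwise(const std::uint8_t* d, std::size_t len) noexcept {
    std::uint32_t crc = 0xFFFFFFFFu;
    for (std::size_t i = 0; i < len; ++i) {
        crc ^= d[i];
        for (int bit = 0; bit < 8; ++bit)
            crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
    }
    return crc ^ 0xFFFFFFFFu;
}
```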
@@ -365,16 +366,23 @@ namespace fastgltf {
 }
 #endif
 
+#if FASTGLTF_CPP_20 && defined(__cpp_lib_byteswap) && __cpp_lib_byteswap >= 202110L
+template<class T>
+constexpr T byteswap(T n) noexcept {
+return std::byteswap(n);
+}
+#else
 template<typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
 #if FASTGLTF_CONSTEXPR_BITCAST
 constexpr
 #endif
-T byteswap(T value) noexcept {
+auto byteswap(T value) noexcept {
 static_assert(std::has_unique_object_representations_v<T>, "T may not have padding bits");
 auto bytes = bit_cast<std::array<std::byte, sizeof(T)>>(value);
 bytes = decltype(bytes)(bytes.rbegin(), bytes.rend());
 return bit_cast<T>(bytes);
 }
+#endif
 
 /**
 * Returns the absolute value of the given integer in its unsigned type.
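The pre-C++20 path reverses the value's object representation through bit_cast to a byte array. A self-contained sketch of the same technique using std::memcpy in place of bit_cast (hypothetical helper name, not fastgltf's code):

```cpp
#include <algorithm>
#include <array>
#include <cstring>
#include <type_traits>

// Reverse an integer's object representation: copy it into a byte array,
// reverse the array, copy it back. std::memcpy stands in for bit_cast here.
template<typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
T byteswap_fallback(T value) noexcept {
    std::array<unsigned char, sizeof(T)> bytes;
    std::memcpy(bytes.data(), &value, sizeof(T));
    std::reverse(bytes.begin(), bytes.end());
    T out;
    std::memcpy(&out, bytes.data(), sizeof(T));
    return out;
}

// e.g. byteswap_fallback<std::uint32_t>(0x11223344u) == 0x44332211u
```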
src/base64.cpp (17 changes: 9 additions & 8 deletions)
@@ -63,6 +63,8 @@ namespace fg = fastgltf;
 
 #if defined(_MSC_VER)
 #define FORCEINLINE __forceinline
+#elif defined(__GNUC__) || defined(__clang__)
+#define FORCEINLINE [[gnu::always_inline]]
 #else
 // On other compilers we need the inline specifier, so that the functions in this compilation unit
 // can be properly inlined without the "function body can be overwritten at link time" error.
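The new #elif branch gives GCC and Clang a forced-inline spelling instead of the generic fallback. A condensed sketch of this kind of per-compiler dispatch (illustrative, not the full file):

```cpp
// Per-compiler force-inline dispatch. The `inline` keyword is added in this
// sketch for ODR safety; the diff defines the macro as the bare attribute
// because it only applies it to functions within a single translation unit.
#if defined(_MSC_VER)
    #define FORCEINLINE __forceinline
#elif defined(__GNUC__) || defined(__clang__)
    #define FORCEINLINE [[gnu::always_inline]] inline
#else
    #define FORCEINLINE inline
#endif

FORCEINLINE int twice(int x) { return x * 2; } // expanded at each call site
```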
@@ -118,8 +120,7 @@ namespace fastgltf::base64 {
 // The AVX and SSE decoding functions are based on http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html.
 // It covers various methods of en-/decoding base64 using SSE and AVX and also shows their
 // performance metrics.
-// TODO: Mark these functions with msvc::forceinline which is available from C++20
-[[gnu::target("avx2"), gnu::always_inline]] FORCEINLINE auto avx2_lookup_pshufb_bitmask(const __m256i input) {
+[[gnu::target("avx2")]] FORCEINLINE auto avx2_lookup_pshufb_bitmask(const __m256i input) {
 const auto higher_nibble = _mm256_and_si256(_mm256_srli_epi32(input, 4), _mm256_set1_epi8(0x0f));
 
 const auto shiftLUT = _mm256_setr_epi8(
@@ -136,7 +137,7 @@ namespace fastgltf::base64 {
 return _mm256_add_epi8(input, shift);
 }
 
-[[gnu::target("avx2"), gnu::always_inline]] FORCEINLINE auto avx2_pack_ints(__m256i input) {
+[[gnu::target("avx2")]] FORCEINLINE auto avx2_pack_ints(__m256i input) {
 const auto merge = _mm256_maddubs_epi16(input, _mm256_set1_epi32(0x01400140));
 return _mm256_madd_epi16(merge, _mm256_set1_epi32(0x00011000));
 }
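The two multiply-add steps merge four 6-bit values per 32-bit lane into one 24-bit integer. The same arithmetic in scalar form (a sketch):

```cpp
#include <cstdint>

// Per 32-bit lane, the SIMD pack turns four sextets a,b,c,d into the 24-bit
// integer (a << 18) | (b << 12) | (c << 6) | d via two multiply-adds.
std::uint32_t pack_sextets(std::uint8_t a, std::uint8_t b, std::uint8_t c, std::uint8_t d) {
    std::uint32_t hi = a * 0x40u + b; // maddubs with the 0x01400140 constant
    std::uint32_t lo = c * 0x40u + d;
    return hi * 0x1000u + lo;         // madd with the 0x00011000 constant
}
```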
@@ -203,7 +204,7 @@ namespace fastgltf::base64 {
 return ret;
 }
 
-[[gnu::target("sse4.1"), gnu::always_inline]] FORCEINLINE auto sse4_lookup_pshufb_bitmask(const __m128i input) {
+[[gnu::target("sse4.1")]] FORCEINLINE auto sse4_lookup_pshufb_bitmask(const __m128i input) {
 const auto higher_nibble = _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0f));
 
 const auto shiftLUT = _mm_setr_epi8(
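The pshufb-based lookup classifies each byte by its high nibble and adds an offset from shiftLUT to map ASCII to a 6-bit value. A scalar rendering of the idea (a sketch; the vector code also validates input and, in lines not shown here, handles the '/' character, whose nibble collides with '+'):

```cpp
#include <array>
#include <cstdint>

// Scalar sketch of the shift-LUT translation used by the SSE/AVX/NEON paths:
// '0'-'9' (nibble 3) get +4, 'A'-'Z' (4,5) get -65, 'a'-'z' (6,7) get -71,
// '+' (nibble 2) gets +19.
std::uint8_t decode_sextet(std::uint8_t ascii) {
    static constexpr std::array<std::int8_t, 16> shiftLUT = {
        0, 0, 19, 4, -65, -65, -71, -71,
        0, 0, 0, 0, 0, 0, 0, 0 };
    if (ascii == 0x2F)
        return 63; // '/' is special-cased separately in the vectorised versions
    return static_cast<std::uint8_t>(ascii + shiftLUT[ascii >> 4]);
}
```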
@@ -217,7 +218,7 @@ namespace fastgltf::base64 {
 return _mm_add_epi8(input, shift);
 }
 
-[[gnu::target("sse4.1"), gnu::always_inline]] FORCEINLINE auto sse4_pack_ints(__m128i input) {
+[[gnu::target("sse4.1")]] FORCEINLINE auto sse4_pack_ints(__m128i input) {
 const auto merge = _mm_maddubs_epi16(input, _mm_set1_epi32(0x01400140));
 return _mm_madd_epi16(merge, _mm_set1_epi32(0x00011000));
 }
@@ -278,7 +279,7 @@ namespace fastgltf::base64 {
 return ret;
 }
 #elif defined(FASTGLTF_IS_A64)
-[[gnu::always_inline]] FORCEINLINE int8x16_t neon_lookup_pshufb_bitmask(const uint8x16_t input) {
+FORCEINLINE int8x16_t neon_lookup_pshufb_bitmask(const uint8x16_t input) {
 // clang-format off
 constexpr std::array<int8_t, 16> shiftLUTdata = {
 0, 0, 19, 4, -65, -65, -71, -71,
@@ -297,7 +298,7 @@ namespace fastgltf::base64 {
 return vaddq_s8(input, shift);
 }
 
-[[gnu::always_inline]] FORCEINLINE int16x8_t neon_pack_ints(const int8x16_t input) {
+FORCEINLINE int16x8_t neon_pack_ints(const int8x16_t input) {
 const uint32x4_t mask = vdupq_n_u32(0x01400140);
 
 const int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(input))), vmovl_s8(vget_low_s8(mask)));
@@ -386,7 +387,7 @@ static constexpr std::array<std::uint8_t, 128> base64lut = {
 
 namespace fastgltf::base64 {
 template <typename Output>
-[[gnu::always_inline]] FORCEINLINE void decode_block(std::array<std::uint8_t, 4>& sixBitChars, Output output) {
+FORCEINLINE void decode_block(std::array<std::uint8_t, 4>& sixBitChars, Output output) {
 for (std::size_t i = 0; i < 4; i++) {
 assert(static_cast<std::size_t>(sixBitChars[i]) < base64lut.size());
 sixBitChars[i] = base64lut[sixBitChars[i]];
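After the LUT pass, each element holds a 6-bit value and a 4-character block decodes to 3 output bytes. A scalar sketch of that step (hypothetical helper, not the template from the diff):

```cpp
#include <array>
#include <cstdint>

// Pack four sextets into 24 bits and emit them as three bytes.
void decode_block_scalar(const std::array<std::uint8_t, 4>& sixBitChars, std::uint8_t* out) {
    const std::uint32_t bits =
          (std::uint32_t(sixBitChars[0]) << 18) | (std::uint32_t(sixBitChars[1]) << 12)
        | (std::uint32_t(sixBitChars[2]) << 6)  |  std::uint32_t(sixBitChars[3]);
    out[0] = static_cast<std::uint8_t>(bits >> 16);
    out[1] = static_cast<std::uint8_t>(bits >> 8);
    out[2] = static_cast<std::uint8_t>(bits);
}
```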
src/fastgltf.cpp (13 changes: 6 additions & 7 deletions)
@@ -73,22 +73,22 @@ namespace fastgltf {
 static_assert(sizeof(BinaryGltfHeader) == 12, "Binary gltf header must be 12 bytes");
 static_assert(std::is_trivially_copyable_v<BinaryGltfHeader>);
 
-void readUint32LE(std::uint32_t& x, std::byte* bytes) noexcept {
+constexpr void readUint32LE(std::uint32_t& x, std::byte* bytes) noexcept {
 x = std::uint32_t(bytes[0])
 | (std::uint32_t(bytes[1]) << 8)
 | (std::uint32_t(bytes[2]) << 16)
 | (std::uint32_t(bytes[3]) << 24);
 }
 
-void writeUint32LE(std::uint32_t x, std::byte* buffer) noexcept {
+constexpr void writeUint32LE(std::uint32_t x, std::byte* buffer) noexcept {
 buffer[0] = static_cast<std::byte>(x);
 buffer[1] = static_cast<std::byte>(x >> 8);
 buffer[2] = static_cast<std::byte>(x >> 16);
 buffer[3] = static_cast<std::byte>(x >> 24);
 }
 
 /** GLBs are always little-endian, meaning we need to read the values accordingly */
-[[nodiscard, gnu::always_inline]] auto readBinaryHeader(GltfDataGetter& getter) noexcept {
+[[nodiscard, gnu::always_inline]] inline auto readBinaryHeader(GltfDataGetter& getter) noexcept {
 std::array<std::byte, sizeof(BinaryGltfHeader)> bytes {};
 getter.read(bytes.data(), bytes.size());
 
@@ -99,7 +99,7 @@ namespace fastgltf {
 return header;
 }
 
-[[gnu::always_inline]] auto writeBinaryHeader(const BinaryGltfHeader& header) noexcept {
+[[gnu::always_inline]] inline auto writeBinaryHeader(const BinaryGltfHeader& header) noexcept {
 std::array<std::byte, sizeof(BinaryGltfHeader)> bytes {};
 writeUint32LE(header.magic, &bytes[offsetof(BinaryGltfHeader, magic)]);
 writeUint32LE(header.version, &bytes[offsetof(BinaryGltfHeader, version)]);
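These helpers serialize the 12-byte GLB header, three little-endian uint32 fields per the glTF 2.0 spec. A standalone sketch of reading one back (hypothetical helper names, not fastgltf API):

```cpp
#include <cstdint>

struct GlbHeader { std::uint32_t magic, version, length; };

// Assemble the header's little-endian fields from raw bytes.
GlbHeader parseGlbHeader(const unsigned char* bytes) {
    auto u32le = [](const unsigned char* p) {
        return std::uint32_t(p[0]) | (std::uint32_t(p[1]) << 8)
             | (std::uint32_t(p[2]) << 16) | (std::uint32_t(p[3]) << 24);
    };
    // A valid GLB starts with magic 0x46546C67, the ASCII bytes "glTF".
    return GlbHeader { u32le(bytes), u32le(bytes + 4), u32le(bytes + 8) };
}
```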
@@ -113,7 +113,7 @@ namespace fastgltf {
 };
 static_assert(std::is_trivially_copyable_v<BinaryGltfChunk>);
 
-[[nodiscard, gnu::always_inline]] auto readBinaryChunk(GltfDataGetter& getter) noexcept {
+[[nodiscard, gnu::always_inline]] inline auto readBinaryChunk(GltfDataGetter& getter) noexcept {
 std::array<std::byte, sizeof(BinaryGltfChunk)> bytes {};
 getter.read(bytes.data(), bytes.size());
 
@@ -123,7 +123,7 @@ namespace fastgltf {
 return chunk;
 }
 
-[[gnu::always_inline]] auto writeBinaryChunk(const BinaryGltfChunk& chunk) noexcept {
+[[gnu::always_inline]] inline auto writeBinaryChunk(const BinaryGltfChunk& chunk) noexcept {
 std::array<std::byte, sizeof(BinaryGltfChunk)> bytes {};
 writeUint32LE(chunk.chunkLength, &bytes[offsetof(BinaryGltfChunk, chunkLength)]);
 writeUint32LE(chunk.chunkType, &bytes[offsetof(BinaryGltfChunk, chunkType)]);
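A GLB chunk header is likewise two little-endian uint32s, chunkLength followed by chunkType. A sketch of emitting a JSON chunk header (hypothetical helper, not fastgltf API):

```cpp
#include <array>
#include <cstddef>
#include <cstdint>

// Build the 8-byte chunk header for a JSON chunk.
std::array<std::byte, 8> makeJsonChunkHeader(std::uint32_t jsonLength) {
    std::array<std::byte, 8> bytes {};
    auto put32le = [&](std::size_t off, std::uint32_t x) {
        for (std::size_t i = 0; i < 4; ++i)
            bytes[off + i] = static_cast<std::byte>(x >> (8 * i));
    };
    put32le(0, jsonLength);  // chunkLength
    put32le(4, 0x4E4F534Au); // chunkType: "JSON" read as a little-endian uint32
    return bytes;
}
```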
@@ -173,7 +173,6 @@ namespace fastgltf {
 
 // Decrementing the length variable and incrementing the pointer directly has better codegen with Clang
 // than using a std::size_t i = 0.
-// TODO: is there perhaps just some intrinsic we can use instead of inline asm?
 auto length = static_cast<std::int64_t>(len);
 while ((length -= sizeof(std::uint64_t)) >= 0) {
 std::uint64_t value;
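The loop shape the comment describes, in isolation (a sketch with a placeholder body, not the CRC routine from the diff): subtract 8 from a signed length per iteration, then recover the tail count from the over-subtraction.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

std::uint64_t sum_words(const std::uint8_t* d, std::size_t len) {
    std::uint64_t acc = 0;
    auto length = static_cast<std::int64_t>(len);
    while ((length -= sizeof(std::uint64_t)) >= 0) {
        std::uint64_t value;
        std::memcpy(&value, d, sizeof value); // unaligned-safe 8-byte load
        acc += value;
        d += sizeof(std::uint64_t);
    }
    // Re-add the final subtraction to get the number of leftover bytes.
    for (length += sizeof(std::uint64_t); length > 0; --length)
        acc += *d++;
    return acc;
}
```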
