Skip to content

Commit

Permalink
add example packing floats
Browse files Browse the repository at this point in the history
Also refactor IntegralReference into common header.
  • Loading branch information
bernhardmgruber committed Nov 15, 2021
1 parent 293b0ec commit 75cb6fb
Show file tree
Hide file tree
Showing 5 changed files with 437 additions and 136 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ if (LLAMA_BUILD_EXAMPLES)
add_subdirectory("examples/raycast")
add_subdirectory("examples/bitpack")
add_subdirectory("examples/bytesplit")
add_subdirectory("examples/floatpack")

# alpaka examples
find_package(alpaka 0.7.0 QUIET)
Expand Down
218 changes: 82 additions & 136 deletions examples/bitpack/bitpack.cpp
Original file line number Diff line number Diff line change
@@ -1,166 +1,112 @@
#include "../common/IntegralReference.hpp"

#include <cstdint>
#include <fmt/core.h>
#include <llama/llama.hpp>

// clang-format off
namespace tag
namespace mapping
{
struct X{};
struct Y{};
struct Z{};
} // namespace tag

using Vector = llama::Record<
llama::Field<tag::X, std::uint16_t>,
llama::Field<tag::Y, std::int32_t>,
llama::Field<tag::Z, std::uint64_t>
>;
// clang-format on

template<
typename TArrayExtents,
typename TRecordDim,
typename LinearizeArrayDimsFunctor = llama::mapping::LinearizeArrayDimsCpp>
struct BitpackSoA : TArrayExtents
{
using ArrayExtents = TArrayExtents;
using ArrayIndex = typename ArrayExtents::Index;
using RecordDim = TRecordDim;

static constexpr std::size_t blobCount = boost::mp11::mp_size<llama::FlatRecordDim<RecordDim>>::value;

constexpr BitpackSoA() = default;

LLAMA_FN_HOST_ACC_INLINE
constexpr explicit BitpackSoA(unsigned bits, ArrayExtents extents, RecordDim = {})
: ArrayExtents(extents)
, bits{bits}
template<
typename TArrayExtents,
typename TRecordDim,
typename LinearizeArrayDimsFunctor = llama::mapping::LinearizeArrayDimsCpp>
struct BitpackSoA : TArrayExtents
{
}
using ArrayExtents = TArrayExtents;
using ArrayIndex = typename ArrayExtents::Index;
using RecordDim = TRecordDim;

LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents
{
return *this; // NOLINT(cppcoreguidelines-slicing)
}
static constexpr std::size_t blobCount = boost::mp11::mp_size<llama::FlatRecordDim<RecordDim>>::value;

LLAMA_FN_HOST_ACC_INLINE
constexpr auto blobSize(std::size_t /*blobIndex*/) const -> std::size_t
{
return (LinearizeArrayDimsFunctor{}.size(extents()) * bits + CHAR_BIT - 1) / CHAR_BIT;
}
using StoredIntegral
= std::uint64_t; // TODO(bgruber): we should choose an integral type which is as large as the
// largest type in the record dim. Otherwise, we might violate the alignment of the blobs.

template<std::size_t... RecordCoords>
static constexpr auto isComputed(llama::RecordCoord<RecordCoords...>)
{
return true;
}
constexpr BitpackSoA() = default;

// FIXME: might violate alignment
using RegisterInt = std::uint64_t;
LLAMA_FN_HOST_ACC_INLINE
constexpr explicit BitpackSoA(unsigned bits, ArrayExtents extents, RecordDim = {})
: ArrayExtents(extents)
, bits{bits}
{
}

template<typename T, typename Pointer>
struct Reference
{
Pointer ptr;
std::size_t bitOffset;
unsigned bits;
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents
{
return *this; // NOLINT(cppcoreguidelines-slicing)
}

static constexpr auto registerBits = sizeof(RegisterInt) * CHAR_BIT;
LLAMA_FN_HOST_ACC_INLINE
constexpr auto blobSize(std::size_t /*blobIndex*/) const -> std::size_t
{
constexpr auto bitsPerStoredIntegral = sizeof(StoredIntegral) * CHAR_BIT;
return (LinearizeArrayDimsFunctor{}.size(extents()) * bits + bitsPerStoredIntegral - 1)
/ bitsPerStoredIntegral;
}

// NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
operator T() const
template<std::size_t... RecordCoords>
static constexpr auto isComputed(llama::RecordCoord<RecordCoords...>)
{
auto* p = ptr + bitOffset / registerBits;
const auto innerBitOffset = bitOffset % registerBits;
auto v = p[0] >> innerBitOffset;

const auto innerBitEndOffset = innerBitOffset + bits;
if(innerBitEndOffset <= registerBits)
{
const auto mask = (RegisterInt{1} << bits) - 1u;
v &= mask;
}
else
{
const auto excessBits = innerBitEndOffset - registerBits;
const auto bitsLoaded = registerBits - innerBitOffset;
const auto mask = (RegisterInt{1} << excessBits) - 1u;
v |= (p[1] & mask) << bitsLoaded;
}
if constexpr(std::is_signed_v<T>)
if((v & (RegisterInt{1} << (bits - 1))) != 0)
{
// sign extend
v |= static_cast<RegisterInt>(-1) << bits;
}
return static_cast<T>(v);
return true;
}

auto operator=(T v) -> Reference&
template<std::size_t... RecordCoords, typename Blob>
LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(
ArrayIndex ai,
llama::RecordCoord<RecordCoords...>,
llama::Array<Blob, blobCount>& blobs) const
{
const auto mask = (RegisterInt{1} << bits) - 1u;
const auto vBits = (static_cast<RegisterInt>(v) & mask);

auto* p = ptr + bitOffset / registerBits;
const auto innerBitOffset = bitOffset % registerBits;
const auto clearMask = ~(mask << innerBitOffset);
auto m = p[0] & clearMask; // clear previous bits
m |= vBits << innerBitOffset; // write new bits
p[0] = m;

const auto innerBitEndOffset = innerBitOffset + bits;
if(innerBitEndOffset > registerBits)
{
const auto excessBits = innerBitEndOffset - registerBits;
const auto bitsWritten = registerBits - innerBitOffset;
const auto clearMask = ~((RegisterInt{1} << excessBits) - 1u);
auto m = p[1] & clearMask; // clear previous bits
m |= vBits >> bitsWritten; // write new bits
p[1] = m;
}

return *this;
constexpr auto blob = llama::flatRecordCoord<RecordDim, llama::RecordCoord<RecordCoords...>>;
const auto bitOffset = LinearizeArrayDimsFunctor{}(ai, extents()) * bits;

using DstType = llama::GetType<RecordDim, llama::RecordCoord<RecordCoords...>>;
return internal::IntegralReference<DstType, StoredIntegral*>{
reinterpret_cast<StoredIntegral*>(&blobs[blob][0]),
bitOffset,
bits};
}
};

template<std::size_t... RecordCoords, typename Blob>
LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(
ArrayIndex ai,
llama::RecordCoord<RecordCoords...>,
llama::Array<Blob, blobCount>& blobs) const
{
constexpr auto blob = llama::flatRecordCoord<RecordDim, llama::RecordCoord<RecordCoords...>>;
const auto bitOffset = LinearizeArrayDimsFunctor{}(ai, extents()) * bits;
template<std::size_t... RecordCoords, typename Blob>
LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(
ArrayIndex ai,
llama::RecordCoord<RecordCoords...>,
const llama::Array<Blob, blobCount>& blobs) const
{
constexpr auto blob = llama::flatRecordCoord<RecordDim, llama::RecordCoord<RecordCoords...>>;
const auto bitOffset = LinearizeArrayDimsFunctor{}(ai, extents()) * bits;

using DstType = llama::GetType<RecordDim, llama::RecordCoord<RecordCoords...>>;
return internal::IntegralReference<DstType, const StoredIntegral*>{
reinterpret_cast<const StoredIntegral*>(&blobs[blob][0]),
bitOffset,
bits};
}

using DstType = llama::GetType<RecordDim, llama::RecordCoord<RecordCoords...>>;
return Reference<DstType, RegisterInt*>{reinterpret_cast<RegisterInt*>(&blobs[blob][0]), bitOffset, bits};
}
private:
unsigned bits = 0;
};
} // namespace mapping

template<std::size_t... RecordCoords, typename Blob>
LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(
ArrayIndex ai,
llama::RecordCoord<RecordCoords...>,
const llama::Array<Blob, blobCount>& blobs) const
{
constexpr auto blob = llama::flatRecordCoord<RecordDim, llama::RecordCoord<RecordCoords...>>;
const auto bitOffset = LinearizeArrayDimsFunctor{}(ai, extents()) * bits;

using DstType = llama::GetType<RecordDim, llama::RecordCoord<RecordCoords...>>;
return Reference<DstType, const RegisterInt*>{
reinterpret_cast<const RegisterInt*>(&blobs[blob][0]),
bitOffset,
bits};
}
// clang-format off
namespace tag
{
struct X{};
struct Y{};
struct Z{};
} // namespace tag

private:
unsigned bits = 0;
};
using Vector = llama::Record<
llama::Field<tag::X, std::uint16_t>,
llama::Field<tag::Y, std::int32_t>,
llama::Field<tag::Z, std::uint64_t>
>;
// clang-format on

auto main() -> int
{
constexpr auto N = 128;
constexpr auto bits = 7;
const auto mapping = BitpackSoA{bits, llama::ArrayExtents<llama::dyn>{N}, Vector{}};
const auto mapping = mapping::BitpackSoA{bits, llama::ArrayExtents<llama::dyn>{N}, Vector{}};

auto view = llama::allocView(mapping);

Expand Down
83 changes: 83 additions & 0 deletions examples/common/IntegralReference.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#pragma once

#include <climits>
#include <type_traits>

namespace internal
{
    /// Proxy reference to an integral value that is bit-packed inside a buffer of wider storage words.
    /// Reading reassembles the value (including sign extension for signed types); assigning packs the
    /// low \c bits bits of the value into the buffer, possibly straddling two storage words.
    /// @tparam Integral Integral data type which can be loaded and stored through this reference.
    /// @tparam StoredIntegralPointer Pointer to the integral type used as storage words.
    template<typename Integral, typename StoredIntegralPointer>
    struct IntegralReference
    {
        using StoredIntegral = std::remove_const_t<std::remove_pointer_t<StoredIntegralPointer>>;

        static_assert(std::is_integral_v<Integral>);
        static_assert(std::is_integral_v<StoredIntegral>);
        static_assert(
            sizeof(StoredIntegral) >= sizeof(Integral),
            "The integral type used for the storage must be at least as big as the type of the values to retrieve");

        StoredIntegralPointer ptr; // base of the storage buffer
        std::size_t bitOffset; // absolute bit position of the value within the buffer
        unsigned bits; // number of bits occupied by each packed value

        static constexpr auto registerBits = sizeof(StoredIntegral) * CHAR_BIT;

        // NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
        operator Integral() const
        {
            auto* word = ptr + bitOffset / registerBits;
            const auto startBit = bitOffset % registerBits;
            const auto endBit = startBit + bits;

            auto value = word[0] >> startBit;
            if(endBit <= registerBits)
            {
                // value lies entirely within one storage word: mask off anything above it
                value &= (StoredIntegral{1} << bits) - 1u;
            }
            else
            {
                // value straddles two storage words: splice in the remaining bits from the next word
                const auto bitsFromSecondWord = endBit - registerBits;
                const auto bitsFromFirstWord = registerBits - startBit;
                const auto secondWordMask = (StoredIntegral{1} << bitsFromSecondWord) - 1u;
                value |= (word[1] & secondWordMask) << bitsFromFirstWord;
            }
            if constexpr(std::is_signed_v<Integral>)
                if((value & (StoredIntegral{1} << (bits - 1))) != 0)
                    value |= static_cast<StoredIntegral>(-1) << bits; // sign extend
            return static_cast<Integral>(value);
        }

        auto operator=(Integral v) -> IntegralReference&
        {
            const auto valueMask = (StoredIntegral{1} << bits) - 1u;
            const auto packed = static_cast<StoredIntegral>(v) & valueMask;

            auto* word = ptr + bitOffset / registerBits;
            const auto startBit = bitOffset % registerBits;
            // clear the destination bits in the first word, then merge in the new value
            word[0] = (word[0] & ~(valueMask << startBit)) | (packed << startBit);

            const auto endBit = startBit + bits;
            if(endBit > registerBits)
            {
                // spill the bits that did not fit into the following storage word
                const auto bitsIntoSecondWord = endBit - registerBits;
                const auto keepMask = ~((StoredIntegral{1} << bitsIntoSecondWord) - 1u);
                word[1] = (word[1] & keepMask) | (packed >> (registerBits - startBit));
            }

            return *this;
        }
    };
} // namespace internal
9 changes: 9 additions & 0 deletions examples/floatpack/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Build script for the floatpack example. It can be configured standalone
# (against an installed llama) or as part of the main llama build tree.
cmake_minimum_required (VERSION 3.15)
project(llama-floatpack CXX)

# When built as a subdirectory of the main llama tree, the llama::llama target
# already exists; otherwise locate an installed llama package.
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()
add_executable(${PROJECT_NAME} floatpack.cpp)
# llama requires C++17 features in consuming code.
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama)
Loading

0 comments on commit 75cb6fb

Please sign in to comment.