Skip to content

Commit

Permalink
Separate record dim flattening from field permutations
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber committed Sep 29, 2023
1 parent fa8e470 commit 3dc3665
Show file tree
Hide file tree
Showing 14 changed files with 138 additions and 148 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- added macros `LLAMA_FORCE_INLINE` and `LLAMA_HOST_ACC` #366
- support clang as CUDA compiler #366
- `llama::mapping::SoA` and `llama::mapping::AoSoA` now support custom record dimension flatteners #371
- added the `llama::mapping::FlattenRecordDimIncreasingAlignment`, `llama::mapping::FlattenRecordDimDecreasingAlignment` and `llama::mapping::FlattenRecordDimMinimizePadding` record dimension flatteners #371
- added the `llama::mapping::PermuteFieldsIncreasingAlignment`, `llama::mapping::PermuteFieldsDecreasingAlignment` and `llama::mapping::PermuteFieldsMinimizePadding` record dimension field permuters #371
- added new mapping `llama::mapping::BitPackedIntSoA` bitpacking integers in the record dimension into SoA arrays, and added new example #372, #427, #441, #446
- added new mapping `llama::mapping::BitPackedFloatSoA` bitpacking floating-point types in the record dimension into SoA arrays, and added new example #414, #427, #446
- `LLAMA_FORCE_INLINE` views can be created on `const` blobs #375
Expand Down
14 changes: 7 additions & 7 deletions docs/pages/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -178,15 +178,15 @@ Accessors
.. doxygenstruct:: llama::accessor::Restrict
.. doxygenstruct:: llama::accessor::Atomic
.. doxygenstruct:: llama::accessor::Stacked
RecordDim field permuters
^^^^^^^^^^^^^^^^^^^^^^^^^

RecordDim flattener
^^^^^^^^^^^^^^^^^^^

.. doxygenstruct:: llama::mapping::FlattenRecordDimInOrder
.. doxygenstruct:: llama::mapping::FlattenRecordDimSorted
.. doxygentypedef:: llama::mapping::FlattenRecordDimIncreasingAlignment
.. doxygentypedef:: llama::mapping::FlattenRecordDimDecreasingAlignment
.. doxygentypedef:: llama::mapping::FlattenRecordDimMinimizePadding
.. doxygenstruct:: llama::mapping::PermuteFieldsInOrder
.. doxygenstruct:: llama::mapping::PermuteFieldsSorted
.. doxygentypedef:: llama::mapping::PermuteFieldsIncreasingAlignment
.. doxygentypedef:: llama::mapping::PermuteFieldsDecreasingAlignment
.. doxygentypedef:: llama::mapping::PermuteFieldsMinimizePadding

Common utilities
^^^^^^^^^^^^^^^^
Expand Down
8 changes: 4 additions & 4 deletions examples/cuda/pitch/pitch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,14 @@ namespace llamaex
typename TArrayExtents,
typename TRecordDim,
bool AlignAndPad = true,
template<typename> typename FlattenRecordDim = mapping::FlattenRecordDimInOrder>
template<typename> typename PermuteFields = mapping::PermuteFieldsInOrder>
struct PitchedAoS : mapping::MappingBase<TArrayExtents, TRecordDim>
{
private:
static constexpr std::size_t dim = TArrayExtents{}.size();

using Base = mapping::MappingBase<TArrayExtents, TRecordDim>;
using Flattener = FlattenRecordDim<TRecordDim>;
using Permuter = PermuteFields<FlatRecordDim<TRecordDim>>;

Array<std::size_t, dim> pitches;

Expand Down Expand Up @@ -116,9 +116,9 @@ namespace llamaex
#if defined(__NVCC__) && __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 6
*& // mess with nvcc compiler state to workaround bug
#endif
Flattener::template flatIndex<RecordCoords...>;
Permuter::template permute<flatRecordCoord<TRecordDim, RecordCoord<RecordCoords...>>>;
const auto offset
= dot(pitches, ai) + flatOffsetOf<typename Flattener::FlatRecordDim, flatFieldIndex, AlignAndPad>;
= dot(pitches, ai) + flatOffsetOf<typename Permuter::FlatRecordDim, flatFieldIndex, AlignAndPad>;
return {0, offset};
}
};
Expand Down
2 changes: 1 addition & 1 deletion examples/memmap/memmap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ auto main(int argc, const char* argv[]) -> int
Triangle,
llama::mapping::FieldAlignment::Pack,
llama::mapping::LinearizeArrayDimsCpp,
llama::mapping::FlattenRecordDimInOrder>{{n}};
llama::mapping::PermuteFieldsInOrder>{{n}};
if(size != 80u + 4u + mapping.blobSize(0))
{
std::cout << "File size (" << size << ") != 80 + 4 + mapping size: (" << mapping.blobSize(0) << ")\n";
Expand Down
4 changes: 2 additions & 2 deletions include/llama/Simd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ namespace llama
{
static_assert(mapping::isAoS<Mapping>);
static constexpr auto srcStride = flatSizeOf<
typename Mapping::Flattener::FlatRecordDim,
typename Mapping::Permuter::FlatRecordDim,
Mapping::fieldAlignment == llama::mapping::FieldAlignment::Align>;
const auto* srcBaseAddr = reinterpret_cast<const std::byte*>(&srcRef(rc));
ElementSimd elemSimd; // g++-12 really needs the intermediate elemSimd and memcpy
Expand Down Expand Up @@ -235,7 +235,7 @@ namespace llama
else if constexpr(mapping::isAoS<Mapping>)
{
static constexpr auto stride = flatSizeOf<
typename Mapping::Flattener::FlatRecordDim,
typename Mapping::Permuter::FlatRecordDim,
Mapping::fieldAlignment == llama::mapping::FieldAlignment::Align>;
auto* dstBaseAddr = reinterpret_cast<std::byte*>(&dstRef(rc));
const ElementSimd elemSimd = srcSimd(rc);
Expand Down
31 changes: 13 additions & 18 deletions include/llama/mapping/AoS.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@ namespace llama::mapping
/// If Pack, struct members are tightly packed.
/// \tparam TLinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and
/// how big the linear domain gets.
/// \tparam FlattenRecordDim Defines how the record dimension's fields should be flattened. See \ref
/// FlattenRecordDimInOrder, \ref FlattenRecordDimIncreasingAlignment, \ref FlattenRecordDimDecreasingAlignment and
/// \ref FlattenRecordDimMinimizePadding.
/// \tparam PermuteFields Defines how the record dimension's fields should be permuted. See \ref
/// PermuteFieldsInOrder, \ref PermuteFieldsIncreasingAlignment, \ref PermuteFieldsDecreasingAlignment and
/// \ref PermuteFieldsMinimizePadding.
template<
typename TArrayExtents,
typename TRecordDim,
FieldAlignment TFieldAlignment = FieldAlignment::Align,
typename TLinearizeArrayDimsFunctor = LinearizeArrayDimsCpp,
template<typename> typename FlattenRecordDim = FlattenRecordDimInOrder>
template<typename> typename PermuteFields = PermuteFieldsInOrder>
struct AoS : MappingBase<TArrayExtents, TRecordDim>
{
private:
Expand All @@ -30,15 +30,15 @@ namespace llama::mapping
public:
inline static constexpr FieldAlignment fieldAlignment = TFieldAlignment;
using LinearizeArrayDimsFunctor = TLinearizeArrayDimsFunctor;
using Flattener = FlattenRecordDim<TRecordDim>;
using Permuter = PermuteFields<FlatRecordDim<TRecordDim>>;
inline static constexpr std::size_t blobCount = 1;

using Base::Base;

LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(size_type) const -> size_type
{
return LinearizeArrayDimsFunctor{}.size(Base::extents())
* flatSizeOf<typename Flattener::FlatRecordDim, fieldAlignment == FieldAlignment::Align>;
* flatSizeOf<typename Permuter::FlatRecordDim, fieldAlignment == FieldAlignment::Align>;
}

template<std::size_t... RecordCoords>
Expand All @@ -50,13 +50,13 @@ namespace llama::mapping
#if defined(__NVCC__) && __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 6
*& // mess with nvcc compiler state to workaround bug
#endif
Flattener::template flatIndex<RecordCoords...>;
Permuter::template permute<flatRecordCoord<TRecordDim, RecordCoord<RecordCoords...>>>;
const auto offset
= LinearizeArrayDimsFunctor{}(ai, Base::extents())
* static_cast<size_type>(
flatSizeOf<typename Flattener::FlatRecordDim, fieldAlignment == FieldAlignment::Align>)
flatSizeOf<typename Permuter::FlatRecordDim, fieldAlignment == FieldAlignment::Align>)
+ static_cast<size_type>(flatOffsetOf<
typename Flattener::FlatRecordDim,
typename Permuter::FlatRecordDim,
flatFieldIndex,
fieldAlignment == FieldAlignment::Align>);
return {size_type{0}, offset};
Expand All @@ -75,12 +75,8 @@ namespace llama::mapping
/// Array of struct mapping preserving the alignment of the field types by inserting padding and permuting the
/// field order to minimize this padding. \see AoS
template<typename ArrayExtents, typename RecordDim, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
using MinAlignedAoS = AoS<
ArrayExtents,
RecordDim,
FieldAlignment::Align,
LinearizeArrayDimsFunctor,
FlattenRecordDimMinimizePadding>;
using MinAlignedAoS
= AoS<ArrayExtents, RecordDim, FieldAlignment::Align, LinearizeArrayDimsFunctor, PermuteFieldsMinimizePadding>;

/// Array of struct mapping packing the field types tightly, violating the type's alignment requirements.
/// \see AoS
Expand All @@ -107,8 +103,7 @@ namespace llama::mapping
FieldAlignment FieldAlignment,
typename LinearizeArrayDimsFunctor,
template<typename>
typename FlattenRecordDim>
inline constexpr bool
isAoS<AoS<ArrayExtents, RecordDim, FieldAlignment, LinearizeArrayDimsFunctor, FlattenRecordDim>>
typename PermuteFields>
inline constexpr bool isAoS<AoS<ArrayExtents, RecordDim, FieldAlignment, LinearizeArrayDimsFunctor, PermuteFields>>
= true;
} // namespace llama::mapping
15 changes: 7 additions & 8 deletions include/llama/mapping/AoSoA.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@ namespace llama::mapping

/// Array of struct of arrays mapping. Used to create a \ref View via \ref allocView.
/// \tparam Lanes The size of the inner arrays of this array of struct of arrays.
/// \tparam FlattenRecordDim Defines how the record dimension's fields should be flattened. See \ref
/// FlattenRecordDimInOrder, \ref FlattenRecordDimIncreasingAlignment, \ref FlattenRecordDimDecreasingAlignment and
/// \ref FlattenRecordDimMinimizePadding.
/// \tparam PermuteFields Defines how the record dimension's fields should be permuted. See \ref
/// PermuteFieldsInOrder, \ref PermuteFieldsIncreasingAlignment, \ref PermuteFieldsDecreasingAlignment and
/// \ref PermuteFieldsMinimizePadding.
template<
typename TArrayExtents,
typename TRecordDim,
typename TArrayExtents::value_type Lanes,
typename TLinearizeArrayDimsFunctor = LinearizeArrayDimsCpp,
template<typename> typename FlattenRecordDim = FlattenRecordDimInOrder>
template<typename> typename PermuteFields = PermuteFieldsInOrder>
struct AoSoA : MappingBase<TArrayExtents, TRecordDim>
{
private:
Expand All @@ -44,7 +44,7 @@ namespace llama::mapping
public:
inline static constexpr typename TArrayExtents::value_type lanes = Lanes;
using LinearizeArrayDimsFunctor = TLinearizeArrayDimsFunctor;
using Flattener = FlattenRecordDim<TRecordDim>;
using Permuter = PermuteFields<FlatRecordDim<TRecordDim>>;
inline static constexpr std::size_t blobCount = 1;

#if defined(__NVCC__) && __CUDACC_VER_MAJOR__ >= 12
Expand Down Expand Up @@ -72,13 +72,12 @@ namespace llama::mapping
#if defined(__NVCC__) && __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 6
*& // mess with nvcc compiler state to workaround bug
#endif
Flattener::template flatIndex<RecordCoords...>;
Permuter::template permute<flatRecordCoord<TRecordDim, RecordCoord<RecordCoords...>>>;
const auto flatArrayIndex = LinearizeArrayDimsFunctor{}(ai, Base::extents());
const auto blockIndex = flatArrayIndex / Lanes;
const auto laneIndex = flatArrayIndex % Lanes;
const auto offset = static_cast<size_type>(sizeOf<TRecordDim> * Lanes) * blockIndex
+ static_cast<size_type>(flatOffsetOf<typename Flattener::FlatRecordDim, flatFieldIndex, false>)
* Lanes
+ static_cast<size_type>(flatOffsetOf<typename Permuter::FlatRecordDim, flatFieldIndex, false>) * Lanes
+ static_cast<size_type>(sizeof(GetType<TRecordDim, RecordCoord<RecordCoords...>>)) * laneIndex;
return {0, offset};
}
Expand Down
15 changes: 8 additions & 7 deletions include/llama/mapping/BitPackedFloat.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ namespace llama::mapping
typename ExponentBits = typename TArrayExtents::value_type,
typename MantissaBits = ExponentBits,
typename TLinearizeArrayDimsFunctor = LinearizeArrayDimsCpp,
template<typename> typename FlattenRecordDim = FlattenRecordDimInOrder,
template<typename> typename PermuteFields = PermuteFieldsInOrder,
typename TStoredIntegral = internal::StoredIntegralFor<TRecordDim>>
struct LLAMA_DECLSPEC_EMPTY_BASES BitPackedFloatAoS
: MappingBase<TArrayExtents, TRecordDim>
Expand All @@ -332,7 +332,7 @@ namespace llama::mapping
using LinearizeArrayDimsFunctor = TLinearizeArrayDimsFunctor;
using StoredIntegral = TStoredIntegral;

using Flattener = FlattenRecordDim<TRecordDim>;
using Permuter = PermuteFields<FlatRecordDim<TRecordDim>>;
static constexpr std::size_t blobCount = 1;

LLAMA_FN_HOST_ACC_INLINE
Expand Down Expand Up @@ -382,7 +382,8 @@ namespace llama::mapping
RecordCoord<RecordCoords...>,
Blobs& blobs) const
{
constexpr auto flatFieldIndex = static_cast<size_type>(Flattener::template flatIndex<RecordCoords...>);
constexpr auto flatFieldIndex = static_cast<size_type>(
Permuter::template permute<flatRecordCoord<TRecordDim, RecordCoord<RecordCoords...>>>);
const auto bitOffset = ((TLinearizeArrayDimsFunctor{}(ai, Base::extents())
* static_cast<size_type>(flatFieldCount<TRecordDim>))
+ flatFieldIndex)
Expand All @@ -404,7 +405,7 @@ namespace llama::mapping
typename ExponentBits = unsigned,
typename MantissaBits = ExponentBits,
typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp,
template<typename> typename FlattenRecordDim = FlattenRecordDimInOrder,
template<typename> typename PermuteFields = PermuteFieldsInOrder,
typename StoredIntegral = void>
struct BindBitPackedFloatAoS
{
Expand All @@ -415,7 +416,7 @@ namespace llama::mapping
ExponentBits,
MantissaBits,
LinearizeArrayDimsFunctor,
FlattenRecordDim,
PermuteFields,
std::conditional_t<
!std::is_void_v<StoredIntegral>,
StoredIntegral,
Expand All @@ -432,15 +433,15 @@ namespace llama::mapping
typename MantissaBits,
typename LinearizeArrayDimsFunctor,
template<typename>
typename FlattenRecordDim,
typename PermuteFields,
typename StoredIntegral>
inline constexpr bool isBitPackedFloatAoS<BitPackedFloatAoS<
ArrayExtents,
RecordDim,
ExponentBits,
MantissaBits,
LinearizeArrayDimsFunctor,
FlattenRecordDim,
PermuteFields,
StoredIntegral>>
= true;
} // namespace llama::mapping
21 changes: 11 additions & 10 deletions include/llama/mapping/BitPackedInt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -449,9 +449,9 @@ namespace llama::mapping
/// numbers will be read back positive.
/// \tparam TLinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and
/// how big the linear domain gets.
/// \tparam FlattenRecordDim Defines how the record dimension's fields should be flattened. See \ref
// FlattenRecordDimInOrder, \ref FlattenRecordDimIncreasingAlignment, \ref FlattenRecordDimDecreasingAlignment and
// \ref FlattenRecordDimMinimizePadding.
/// \tparam PermuteFields Defines how the record dimension's fields should be permuted. See \ref
/// PermuteFieldsInOrder, \ref PermuteFieldsIncreasingAlignment, \ref PermuteFieldsDecreasingAlignment and
/// \ref PermuteFieldsMinimizePadding.
/// \tparam TStoredIntegral Integral type used as storage of reduced precision integers. Must be std::uint32_t or
/// std::uint64_t.
template<
Expand All @@ -460,7 +460,7 @@ namespace llama::mapping
typename Bits = typename TArrayExtents::value_type,
SignBit SignBit = SignBit::Keep,
typename TLinearizeArrayDimsFunctor = LinearizeArrayDimsCpp,
template<typename> typename FlattenRecordDim = FlattenRecordDimInOrder,
template<typename> typename PermuteFields = PermuteFieldsInOrder,
typename TStoredIntegral = internal::StoredUnsignedFor<TRecordDim>>
struct BitPackedIntAoS
: internal::
Expand All @@ -475,7 +475,7 @@ namespace llama::mapping
using typename Base::size_type;
using VHBits = typename Base::VHBits; // use plain using declaration with nvcc >= 11.8

using Flattener = FlattenRecordDim<TRecordDim>;
// The permuter is instantiated on the flattened record dimension: permute<> below is indexed with a flat
// record coordinate, and the other mappings instantiate their permuters the same way.
using Permuter = PermuteFields<FlatRecordDim<TRecordDim>>;
static constexpr std::size_t blobCount = 1;

LLAMA_FN_HOST_ACC_INLINE
Expand All @@ -493,7 +493,8 @@ namespace llama::mapping
RecordCoord<RecordCoords...>,
Blobs& blobs) const
{
constexpr auto flatFieldIndex = static_cast<size_type>(Flattener::template flatIndex<RecordCoords...>);
constexpr auto flatFieldIndex = static_cast<size_type>(
Permuter::template permute<flatRecordCoord<TRecordDim, RecordCoord<RecordCoords...>>>);
const auto bitOffset = ((TLinearizeArrayDimsFunctor{}(ai, Base::extents())
* static_cast<size_type>(flatFieldCount<TRecordDim>))
+ flatFieldIndex)
Expand All @@ -516,7 +517,7 @@ namespace llama::mapping
typename Bits = void,
SignBit SignBit = SignBit::Keep,
typename LinearizeArrayDimsFunctor = mapping::LinearizeArrayDimsCpp,
template<typename> typename FlattenRecordDim = FlattenRecordDimInOrder,
template<typename> typename PermuteFields = PermuteFieldsInOrder,
typename StoredIntegral = void>
struct BindBitPackedIntAoS
{
Expand All @@ -527,7 +528,7 @@ namespace llama::mapping
std::conditional_t<!std::is_void_v<Bits>, Bits, typename ArrayExtents::value_type>,
SignBit,
LinearizeArrayDimsFunctor,
FlattenRecordDim,
PermuteFields,
std::conditional_t<
!std::is_void_v<StoredIntegral>,
StoredIntegral,
Expand All @@ -544,15 +545,15 @@ namespace llama::mapping
SignBit SignBit,
typename LinearizeArrayDimsFunctor,
template<typename>
typename FlattenRecordDim,
typename PermuteFields,
typename StoredIntegral>
inline constexpr bool isBitPackedIntAoS<BitPackedIntAoS<
ArrayExtents,
RecordDim,
Bits,
SignBit,
LinearizeArrayDimsFunctor,
FlattenRecordDim,
PermuteFields,
StoredIntegral>>
= true;
} // namespace llama::mapping
Loading

0 comments on commit 3dc3665

Please sign in to comment.