Separate record dim flattening from field permutations #782

Merged
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -127,7 +127,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- added macros `LLAMA_FORCE_INLINE` and `LLAMA_HOST_ACC` #366
- support clang as CUDA compiler #366
- `llama::mapping::SoA` and `llama::mapping::AoSoA` now support custom record dimension flatteners #371
- added the `llama::mapping::FlattenRecordDimIncreasingAlignment`, `llama::mapping::FlattenRecordDimDecreasingAlignment` and `llama::mapping::FlattenRecordDimMinimizePadding` record dimension flatteners #371
- added the `llama::mapping::PermuteFieldsIncreasingAlignment`, `llama::mapping::PermuteFieldsDecreasingAlignment` and `llama::mapping::PermuteFieldsMinimizePadding` record dimension field permuters #371
- added new mapping `llama::mapping::BitPackedIntSoA` bitpacking integers in the record dimension into SoA arrays, and added new example #372, #427, #441, #446
- added new mapping `llama::mapping::BitPackedFloatSoA` bitpacking floating-point types in the record dimension into SoA arrays, and added new example #414, #427, #446
- `LLAMA_FORCE_INLINE` views can be created on `const` blobs #375
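The renamed permuters are drop-in replacements for the old flatteners in the mapping template parameter lists. A hypothetical usage sketch follows; the record, tags and extents are made up for illustration, and the template parameter order follows the AoS declaration further down in this diff.

// Hypothetical usage sketch (illustration only).
#include <llama/llama.hpp>
#include <cstddef>

struct X{};
struct Y{};
struct Mass{};

using Particle = llama::Record<
    llama::Field<X, float>,
    llama::Field<Y, float>,
    llama::Field<Mass, double>>;

using ArrayExtents = llama::ArrayExtentsDynamic<std::size_t, 1>;

using Mapping = llama::mapping::AoS<
    ArrayExtents,
    Particle,
    llama::mapping::FieldAlignment::Align,
    llama::mapping::LinearizeArrayDimsCpp,
    llama::mapping::PermuteFieldsMinimizePadding>; // was FlattenRecordDimMinimizePadding

int main()
{
    auto view = llama::allocView(Mapping{ArrayExtents{1024}});
    view(0)(Mass{}) = 1.0; // field access is unaffected by the permutation
}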
14 changes: 7 additions & 7 deletions docs/pages/api.rst
@@ -178,15 +178,15 @@ Accessors
.. doxygenstruct:: llama::accessor::Restrict
.. doxygenstruct:: llama::accessor::Atomic
.. doxygenstruct:: llama::accessor::Stacked
RecordDim field permuters
^^^^^^^^^^^^^^^^^^^^^^^^^

RecordDim flattener
^^^^^^^^^^^^^^^^^^^

.. doxygenstruct:: llama::mapping::FlattenRecordDimInOrder
.. doxygenstruct:: llama::mapping::FlattenRecordDimSorted
.. doxygentypedef:: llama::mapping::FlattenRecordDimIncreasingAlignment
.. doxygentypedef:: llama::mapping::FlattenRecordDimDecreasingAlignment
.. doxygentypedef:: llama::mapping::FlattenRecordDimMinimizePadding
.. doxygenstruct:: llama::mapping::PermuteFieldsInOrder
.. doxygenstruct:: llama::mapping::PermuteFieldsSorted
.. doxygentypedef:: llama::mapping::PermuteFieldsIncreasingAlignment
.. doxygentypedef:: llama::mapping::PermuteFieldsDecreasingAlignment
.. doxygentypedef:: llama::mapping::PermuteFieldsMinimizePadding

Common utilities
^^^^^^^^^^^^^^^^
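The permuters documented here differ only in the order in which they lay out the record's fields. As an illustration only (not LLAMA code), here is the effect of minimizing padding on a hypothetical field set, assuming a typical 64-bit platform where alignof(double) == 8.

// Illustration only: why permuting fields can reduce a record's size.
#include <cstdint>

struct InOrder // declaration order, as PermuteFieldsInOrder keeps it
{
    std::uint8_t flag; // 1 byte + 7 bytes padding before pos
    double pos;        // 8 bytes
    std::uint16_t id;  // 2 bytes + 6 bytes tail padding
};
static_assert(sizeof(InOrder) == 24);

struct Permuted // decreasing alignment, as PermuteFieldsMinimizePadding would pick here
{
    double pos;
    std::uint16_t id;
    std::uint8_t flag; // only 5 bytes of tail padding remain
};
static_assert(sizeof(Permuted) == 16);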
8 changes: 4 additions & 4 deletions examples/cuda/pitch/pitch.cu
@@ -60,14 +60,14 @@ namespace llamaex
typename TArrayExtents,
typename TRecordDim,
bool AlignAndPad = true,
template<typename> typename FlattenRecordDim = mapping::FlattenRecordDimInOrder>
template<typename> typename PermuteFields = mapping::PermuteFieldsInOrder>
struct PitchedAoS : mapping::MappingBase<TArrayExtents, TRecordDim>
{
private:
static constexpr std::size_t dim = TArrayExtents{}.size();

using Base = mapping::MappingBase<TArrayExtents, TRecordDim>;
using Flattener = FlattenRecordDim<TRecordDim>;
using Permuter = PermuteFields<FlatRecordDim<TRecordDim>>;

Array<std::size_t, dim> pitches;

@@ -116,9 +116,9 @@ namespace llamaex
#if defined(__NVCC__) && __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 6
*& // mess with nvcc compiler state to workaround bug
#endif
Flattener::template flatIndex<RecordCoords...>;
Permuter::template permute<flatRecordCoord<TRecordDim, RecordCoord<RecordCoords...>>>;
const auto offset
= dot(pitches, ai) + flatOffsetOf<typename Flattener::FlatRecordDim, flatFieldIndex, AlignAndPad>;
= dot(pitches, ai) + flatOffsetOf<typename Permuter::FlatRecordDim, flatFieldIndex, AlignAndPad>;
return {0, offset};
}
};
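The updated access function combines the pitched array offset with the permuted field's offset inside the flat record. A plain-C++ analogue of that arithmetic is sketched below; the pixel record and pitches are made up, and offsetof stands in for flatOffsetOf with AlignAndPad.

// Conceptual analogue of the offset computation above (not LLAMA code).
#include <cstddef>

struct Pixel { float r, g, b; };

// 2D pitched AoS: the row pitch may be larger than width * sizeof(Pixel)
std::size_t pitchedOffsetOfG(std::size_t y, std::size_t x, std::size_t rowPitchBytes)
{
    const std::size_t pitches[2] = {rowPitchBytes, sizeof(Pixel)};
    const std::size_t ai[2] = {y, x};
    std::size_t offset = pitches[0] * ai[0] + pitches[1] * ai[1]; // dot(pitches, ai)
    offset += offsetof(Pixel, g); // stands in for flatOffsetOf<..., flatFieldIndex, AlignAndPad>
    return offset;
}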
2 changes: 1 addition & 1 deletion examples/memmap/memmap.cpp
@@ -64,7 +64,7 @@ auto main(int argc, const char* argv[]) -> int
Triangle,
llama::mapping::FieldAlignment::Pack,
llama::mapping::LinearizeArrayDimsCpp,
llama::mapping::FlattenRecordDimInOrder>{{n}};
llama::mapping::PermuteFieldsInOrder>{{n}};
if(size != 80u + 4u + mapping.blobSize(0))
{
std::cout << "File size (" << size << ") != 80 + 4 + mapping size: (" << mapping.blobSize(0) << ")\n";
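The packed AoS mapping with PermuteFieldsInOrder is presumably chosen here so that the mapped layout matches the file's byte layout; alignment padding or a field permutation would break that. A sketch of the size check, assuming the usual binary STL layout (80-byte header, 4-byte triangle count, tightly packed triangle records):

// Assumption: standard binary STL file layout.
#include <cstddef>
#include <cstdint>

constexpr std::size_t expectedFileSize(std::size_t packedTriangleBytes, std::uint32_t n)
{
    return 80 + 4 + n * packedTriangleBytes; // header + count + mapping.blobSize(0)
}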
4 changes: 2 additions & 2 deletions include/llama/Simd.hpp
@@ -195,7 +195,7 @@ namespace llama
{
static_assert(mapping::isAoS<Mapping>);
static constexpr auto srcStride = flatSizeOf<
typename Mapping::Flattener::FlatRecordDim,
typename Mapping::Permuter::FlatRecordDim,
Mapping::fieldAlignment == llama::mapping::FieldAlignment::Align>;
const auto* srcBaseAddr = reinterpret_cast<const std::byte*>(&srcRef(rc));
ElementSimd elemSimd; // g++-12 really needs the intermediate elemSimd and memcpy
@@ -235,7 +235,7 @@
else if constexpr(mapping::isAoS<Mapping>)
{
static constexpr auto stride = flatSizeOf<
typename Mapping::Flattener::FlatRecordDim,
typename Mapping::Permuter::FlatRecordDim,
Mapping::fieldAlignment == llama::mapping::FieldAlignment::Align>;
auto* dstBaseAddr = reinterpret_cast<std::byte*>(&dstRef(rc));
const ElementSimd elemSimd = srcSimd(rc);
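In both branches above, the AoS stride is the byte size of one (permuted, possibly padded) record, and a single field is gathered or scattered at that stride. A standalone sketch of such a strided gather follows; it is not the actual LLAMA code, and the struct and element count are made up.

// Standalone sketch of a strided gather like the AoS branch above.
#include <cstddef>
#include <cstring>

struct Particle { float x, y, z; };

template<std::size_t N>
void gatherY(const Particle* aos, float (&out)[N])
{
    constexpr std::size_t stride = sizeof(Particle); // flatSizeOf<Permuter::FlatRecordDim, Align>
    const auto* base = reinterpret_cast<const std::byte*>(&aos[0].y);
    for(std::size_t i = 0; i < N; i++)
        std::memcpy(&out[i], base + i * stride, sizeof(float)); // memcpy, as in the code above
}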
31 changes: 13 additions & 18 deletions include/llama/mapping/AoS.hpp
@@ -12,15 +12,15 @@ namespace llama::mapping
/// If Pack, struct members are tightly packed.
/// \tparam TLinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and
/// how big the linear domain gets.
/// \tparam FlattenRecordDim Defines how the record dimension's fields should be flattened. See \ref
/// FlattenRecordDimInOrder, \ref FlattenRecordDimIncreasingAlignment, \ref FlattenRecordDimDecreasingAlignment and
/// \ref FlattenRecordDimMinimizePadding.
/// \tparam PermuteFields Defines how the record dimension's fields should be permuted. See \ref
/// PermuteFieldsInOrder, \ref PermuteFieldsIncreasingAlignment, \ref PermuteFieldsDecreasingAlignment and
/// \ref PermuteFieldsMinimizePadding.
template<
typename TArrayExtents,
typename TRecordDim,
FieldAlignment TFieldAlignment = FieldAlignment::Align,
typename TLinearizeArrayDimsFunctor = LinearizeArrayDimsCpp,
template<typename> typename FlattenRecordDim = FlattenRecordDimInOrder>
template<typename> typename PermuteFields = PermuteFieldsInOrder>
struct AoS : MappingBase<TArrayExtents, TRecordDim>
{
private:
@@ -30,15 +30,15 @@
public:
inline static constexpr FieldAlignment fieldAlignment = TFieldAlignment;
using LinearizeArrayDimsFunctor = TLinearizeArrayDimsFunctor;
using Flattener = FlattenRecordDim<TRecordDim>;
using Permuter = PermuteFields<FlatRecordDim<TRecordDim>>;
inline static constexpr std::size_t blobCount = 1;

using Base::Base;

LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(size_type) const -> size_type
{
return LinearizeArrayDimsFunctor{}.size(Base::extents())
* flatSizeOf<typename Flattener::FlatRecordDim, fieldAlignment == FieldAlignment::Align>;
* flatSizeOf<typename Permuter::FlatRecordDim, fieldAlignment == FieldAlignment::Align>;
}

template<std::size_t... RecordCoords>
@@ -50,13 +50,13 @@
#if defined(__NVCC__) && __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 6
*& // mess with nvcc compiler state to workaround bug
#endif
Flattener::template flatIndex<RecordCoords...>;
Permuter::template permute<flatRecordCoord<TRecordDim, RecordCoord<RecordCoords...>>>;
const auto offset
= LinearizeArrayDimsFunctor{}(ai, Base::extents())
* static_cast<size_type>(
flatSizeOf<typename Flattener::FlatRecordDim, fieldAlignment == FieldAlignment::Align>)
flatSizeOf<typename Permuter::FlatRecordDim, fieldAlignment == FieldAlignment::Align>)
+ static_cast<size_type>(flatOffsetOf<
typename Flattener::FlatRecordDim,
typename Permuter::FlatRecordDim,
flatFieldIndex,
fieldAlignment == FieldAlignment::Align>);
return {size_type{0}, offset};
@@ -75,12 +75,8 @@
/// Array of struct mapping preserving the alignment of the field types by inserting padding and permuting the
/// field order to minimize this padding. \see AoS
template<typename ArrayExtents, typename RecordDim, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp>
using MinAlignedAoS = AoS<
ArrayExtents,
RecordDim,
FieldAlignment::Align,
LinearizeArrayDimsFunctor,
FlattenRecordDimMinimizePadding>;
using MinAlignedAoS
= AoS<ArrayExtents, RecordDim, FieldAlignment::Align, LinearizeArrayDimsFunctor, PermuteFieldsMinimizePadding>;

/// Array of struct mapping packing the field types tightly, violating the type's alignment requirements.
/// \see AoS
@@ -107,8 +103,7 @@
FieldAlignment FieldAlignment,
typename LinearizeArrayDimsFunctor,
template<typename>
typename FlattenRecordDim>
inline constexpr bool
isAoS<AoS<ArrayExtents, RecordDim, FieldAlignment, LinearizeArrayDimsFunctor, FlattenRecordDim>>
typename PermuteFields>
inline constexpr bool isAoS<AoS<ArrayExtents, RecordDim, FieldAlignment, LinearizeArrayDimsFunctor, PermuteFields>>
= true;
} // namespace llama::mapping
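After the rename, blobNrAndOffset() first permutes the flat record coordinate and then computes the offset as linearized array index times flatSizeOf of the permuted record plus flatOffsetOf of the permuted field. A plain-C++ analogue follows, with a made-up struct standing in for the already permuted, padded Permuter::FlatRecordDim (illustration only, not LLAMA code).

// Plain-C++ analogue of the AoS offset formula above.
#include <cstddef>
#include <cstdint>

struct Flat { double pos; float vel; std::int32_t id; }; // hypothetical permuted layout

std::size_t aosOffsetOfVel(std::size_t linearArrayIndex)
{
    // linearized array index * size of one padded record + offset of the field inside it
    return linearArrayIndex * sizeof(Flat) + offsetof(Flat, vel);
}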
15 changes: 7 additions & 8 deletions include/llama/mapping/AoSoA.hpp
@@ -26,15 +26,15 @@ namespace llama::mapping

/// Array of struct of arrays mapping. Used to create a \ref View via \ref allocView.
/// \tparam Lanes The size of the inner arrays of this array of struct of arrays.
/// \tparam FlattenRecordDim Defines how the record dimension's fields should be flattened. See \ref
/// FlattenRecordDimInOrder, \ref FlattenRecordDimIncreasingAlignment, \ref FlattenRecordDimDecreasingAlignment and
/// \ref FlattenRecordDimMinimizePadding.
/// \tparam PermuteFields Defines how the record dimension's fields should be permuted. See \ref
/// PermuteFieldsInOrder, \ref PermuteFieldsIncreasingAlignment, \ref PermuteFieldsDecreasingAlignment and
/// \ref PermuteFieldsMinimizePadding.
template<
typename TArrayExtents,
typename TRecordDim,
typename TArrayExtents::value_type Lanes,
typename TLinearizeArrayDimsFunctor = LinearizeArrayDimsCpp,
template<typename> typename FlattenRecordDim = FlattenRecordDimInOrder>
template<typename> typename PermuteFields = PermuteFieldsInOrder>
struct AoSoA : MappingBase<TArrayExtents, TRecordDim>
{
private:
@@ -44,7 +44,7 @@
public:
inline static constexpr typename TArrayExtents::value_type lanes = Lanes;
using LinearizeArrayDimsFunctor = TLinearizeArrayDimsFunctor;
using Flattener = FlattenRecordDim<TRecordDim>;
using Permuter = PermuteFields<FlatRecordDim<TRecordDim>>;
inline static constexpr std::size_t blobCount = 1;

#if defined(__NVCC__) && __CUDACC_VER_MAJOR__ >= 12
@@ -72,13 +72,12 @@
#if defined(__NVCC__) && __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ <= 6
*& // mess with nvcc compiler state to workaround bug
#endif
Flattener::template flatIndex<RecordCoords...>;
Permuter::template permute<flatRecordCoord<TRecordDim, RecordCoord<RecordCoords...>>>;
const auto flatArrayIndex = LinearizeArrayDimsFunctor{}(ai, Base::extents());
const auto blockIndex = flatArrayIndex / Lanes;
const auto laneIndex = flatArrayIndex % Lanes;
const auto offset = static_cast<size_type>(sizeOf<TRecordDim> * Lanes) * blockIndex
+ static_cast<size_type>(flatOffsetOf<typename Flattener::FlatRecordDim, flatFieldIndex, false>)
* Lanes
+ static_cast<size_type>(flatOffsetOf<typename Permuter::FlatRecordDim, flatFieldIndex, false>) * Lanes
+ static_cast<size_type>(sizeof(GetType<TRecordDim, RecordCoord<RecordCoords...>>)) * laneIndex;
return {0, offset};
}
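The AoSoA offset above splits the linearized array index into a block index and a lane index, then skips whole blocks, then the preceding sub-arrays within the block, then the preceding lanes. A worked analogue with made-up field sizes (illustration only, not LLAMA code):

// Worked analogue of the AoSoA offset computation above.
// Sizes assume a hypothetical packed record {float x; float y; double m;}.
#include <cstddef>

std::size_t aosoaOffsetOfY(std::size_t flatArrayIndex, std::size_t lanes)
{
    constexpr std::size_t recordSize = 16; // sizeOf<TRecordDim>: 4 + 4 + 8, packed
    constexpr std::size_t yOffset = 4;     // flatOffsetOf<..., flatFieldIndex, false>
    constexpr std::size_t ySize = 4;       // sizeof(GetType<TRecordDim, RecordCoord<...>>)

    const std::size_t blockIndex = flatArrayIndex / lanes;
    const std::size_t laneIndex = flatArrayIndex % lanes;
    return recordSize * lanes * blockIndex // skip whole AoSoA blocks
        + yOffset * lanes                  // skip the x sub-array inside the block
        + ySize * laneIndex;               // index into the y sub-array
}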
15 changes: 8 additions & 7 deletions include/llama/mapping/BitPackedFloat.hpp
@@ -315,7 +315,7 @@ namespace llama::mapping
typename ExponentBits = typename TArrayExtents::value_type,
typename MantissaBits = ExponentBits,
typename TLinearizeArrayDimsFunctor = LinearizeArrayDimsCpp,
template<typename> typename FlattenRecordDim = FlattenRecordDimInOrder,
template<typename> typename PermuteFields = PermuteFieldsInOrder,
typename TStoredIntegral = internal::StoredIntegralFor<TRecordDim>>
struct LLAMA_DECLSPEC_EMPTY_BASES BitPackedFloatAoS
: MappingBase<TArrayExtents, TRecordDim>
Expand All @@ -332,7 +332,7 @@ namespace llama::mapping
using LinearizeArrayDimsFunctor = TLinearizeArrayDimsFunctor;
using StoredIntegral = TStoredIntegral;

using Flattener = FlattenRecordDim<TRecordDim>;
using Permuter = PermuteFields<FlatRecordDim<TRecordDim>>;
static constexpr std::size_t blobCount = 1;

LLAMA_FN_HOST_ACC_INLINE
@@ -382,7 +382,8 @@
RecordCoord<RecordCoords...>,
Blobs& blobs) const
{
constexpr auto flatFieldIndex = static_cast<size_type>(Flattener::template flatIndex<RecordCoords...>);
constexpr auto flatFieldIndex = static_cast<size_type>(
Permuter::template permute<flatRecordCoord<TRecordDim, RecordCoord<RecordCoords...>>>);
const auto bitOffset = ((TLinearizeArrayDimsFunctor{}(ai, Base::extents())
* static_cast<size_type>(flatFieldCount<TRecordDim>))
+ flatFieldIndex)
@@ -404,7 +405,7 @@
typename ExponentBits = unsigned,
typename MantissaBits = ExponentBits,
typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp,
template<typename> typename FlattenRecordDim = FlattenRecordDimInOrder,
template<typename> typename PermuteFields = PermuteFieldsInOrder,
typename StoredIntegral = void>
struct BindBitPackedFloatAoS
{
@@ -415,7 +416,7 @@
ExponentBits,
MantissaBits,
LinearizeArrayDimsFunctor,
FlattenRecordDim,
PermuteFields,
std::conditional_t<
!std::is_void_v<StoredIntegral>,
StoredIntegral,
@@ -432,15 +433,15 @@
typename MantissaBits,
typename LinearizeArrayDimsFunctor,
template<typename>
typename FlattenRecordDim,
typename PermuteFields,
typename StoredIntegral>
inline constexpr bool isBitPackedFloatAoS<BitPackedFloatAoS<
ArrayExtents,
RecordDim,
ExponentBits,
MantissaBits,
LinearizeArrayDimsFunctor,
FlattenRecordDim,
PermuteFields,
StoredIntegral>>
= true;
} // namespace llama::mapping
21 changes: 11 additions & 10 deletions include/llama/mapping/BitPackedInt.hpp
@@ -449,9 +449,9 @@ namespace llama::mapping
/// numbers will be read back positive.
/// \tparam TLinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and
/// how big the linear domain gets.
/// \tparam FlattenRecordDim Defines how the record dimension's fields should be flattened. See \ref
// FlattenRecordDimInOrder, \ref FlattenRecordDimIncreasingAlignment, \ref FlattenRecordDimDecreasingAlignment and
// \ref FlattenRecordDimMinimizePadding.
/// \tparam PermuteFields Defines how the record dimension's fields should be permuted. See \ref
// PermuteFieldsInOrder, \ref PermuteFieldsIncreasingAlignment, \ref PermuteFieldsDecreasingAlignment and
// \ref PermuteFieldsMinimizePadding.
/// \tparam TStoredIntegral Integral type used as storage of reduced precision integers. Must be std::uint32_t or
/// std::uint64_t.
template<
Expand All @@ -460,7 +460,7 @@ namespace llama::mapping
typename Bits = typename TArrayExtents::value_type,
SignBit SignBit = SignBit::Keep,
typename TLinearizeArrayDimsFunctor = LinearizeArrayDimsCpp,
template<typename> typename FlattenRecordDim = FlattenRecordDimInOrder,
template<typename> typename PermuteFields = PermuteFieldsInOrder,
typename TStoredIntegral = internal::StoredUnsignedFor<TRecordDim>>
struct BitPackedIntAoS
: internal::
@@ -475,7 +475,7 @@
using typename Base::size_type;
using VHBits = typename Base::VHBits; // use plain using declaration with nvcc >= 11.8

using Flattener = FlattenRecordDim<TRecordDim>;
using Permuter = PermuteFields<TRecordDim>;
static constexpr std::size_t blobCount = 1;

LLAMA_FN_HOST_ACC_INLINE
@@ -493,7 +493,8 @@
RecordCoord<RecordCoords...>,
Blobs& blobs) const
{
constexpr auto flatFieldIndex = static_cast<size_type>(Flattener::template flatIndex<RecordCoords...>);
constexpr auto flatFieldIndex = static_cast<size_type>(
Permuter::template permute<flatRecordCoord<TRecordDim, RecordCoord<RecordCoords...>>>);
const auto bitOffset = ((TLinearizeArrayDimsFunctor{}(ai, Base::extents())
* static_cast<size_type>(flatFieldCount<TRecordDim>))
+ flatFieldIndex)
@@ -516,7 +517,7 @@
typename Bits = void,
SignBit SignBit = SignBit::Keep,
typename LinearizeArrayDimsFunctor = mapping::LinearizeArrayDimsCpp,
template<typename> typename FlattenRecordDim = FlattenRecordDimInOrder,
template<typename> typename PermuteFields = PermuteFieldsInOrder,
typename StoredIntegral = void>
struct BindBitPackedIntAoS
{
@@ -527,7 +528,7 @@
std::conditional_t<!std::is_void_v<Bits>, Bits, typename ArrayExtents::value_type>,
SignBit,
LinearizeArrayDimsFunctor,
FlattenRecordDim,
PermuteFields,
std::conditional_t<
!std::is_void_v<StoredIntegral>,
StoredIntegral,
@@ -544,15 +545,15 @@
SignBit SignBit,
typename LinearizeArrayDimsFunctor,
template<typename>
typename FlattenRecordDim,
typename PermuteFields,
typename StoredIntegral>
inline constexpr bool isBitPackedIntAoS<BitPackedIntAoS<
ArrayExtents,
RecordDim,
Bits,
SignBit,
LinearizeArrayDimsFunctor,
FlattenRecordDim,
PermuteFields,
StoredIntegral>>
= true;
} // namespace llama::mapping
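Both bit-packed AoS mappings above compute the same bit offset: ((linearized array index times the number of record fields) plus the permuted field index) times the bits per value. A minimal sketch of that arithmetic (illustration only; the real compute() returns a proxy reference into the blob storage):

// Minimal sketch of the bit-offset arithmetic shared by BitPackedIntAoS and
// BitPackedFloatAoS above.
#include <cstddef>

std::size_t bitOffset(
    std::size_t linearArrayIndex, // TLinearizeArrayDimsFunctor{}(ai, extents)
    std::size_t flatFieldIndex,   // permuted index of the field in the record
    std::size_t fieldCount,       // flatFieldCount<TRecordDim>
    std::size_t bitsPerValue)     // e.g. VHBits::value()
{
    return (linearArrayIndex * fieldCount + flatFieldIndex) * bitsPerValue;
}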