Skip to content

Commit

Permalink
Internalize cuda/detail/core/util.h (#3505)
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber authored Jan 30, 2025
1 parent 5ce5d28 commit a1a73a8
Show file tree
Hide file tree
Showing 16 changed files with 156 additions and 229 deletions.
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ template <typename Policy,
bool ReadLeft>
struct AgentDifference
{
using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, InputIteratorT>::type;
using LoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, InputIteratorT>::type;

using BlockLoad = typename cub::BlockLoadType<Policy, LoadIt>::type;
using BlockStore = typename cub::BlockStoreType<Policy, OutputIteratorT, OutputT>::type;
Expand Down Expand Up @@ -119,7 +119,7 @@ struct AgentDifference
OffsetT num_items)
: temp_storage(temp_storage.Alias())
, input_it(input_it)
, load_it(THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(Policy(), input_it))
, load_it(THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(Policy(), input_it))
, first_tile_previous(first_tile_previous)
, result(result)
, difference_op(difference_op)
Expand Down
8 changes: 4 additions & 4 deletions cub/cub/agent/agent_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ struct agent_t
using key_type = typename ::cuda::std::iterator_traits<KeysIt1>::value_type;
using item_type = typename ::cuda::std::iterator_traits<ItemsIt1>::value_type;

using keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt1>::type;
using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeysIt2>::type;
using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt1>::type;
using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ItemsIt2>::type;
using keys_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeysIt1>::type;
using keys_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeysIt2>::type;
using items_load_it1 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ItemsIt1>::type;
using items_load_it2 = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ItemsIt2>::type;

using block_load_keys1 = typename BlockLoadType<Policy, keys_load_it1>::type;
using block_load_keys2 = typename BlockLoadType<Policy, keys_load_it2>::type;
Expand Down
15 changes: 9 additions & 6 deletions cub/cub/agent/agent_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,10 @@ struct AgentBlockSort

using BlockMergeSortT = BlockMergeSort<KeyT, Policy::BLOCK_THREADS, Policy::ITEMS_PER_THREAD, ValueT>;

using KeysLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeyInputIteratorT>::type;
using ItemsLoadIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ValueInputIteratorT>::type;
using KeysLoadIt =
typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeyInputIteratorT>::type;
using ItemsLoadIt =
typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ValueInputIteratorT>::type;

using BlockLoadKeys = typename cub::BlockLoadType<Policy, KeysLoadIt>::type;
using BlockLoadItems = typename cub::BlockLoadType<Policy, ItemsLoadIt>::type;
Expand Down Expand Up @@ -438,10 +440,11 @@ struct AgentMerge
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
using KeysLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeyIteratorT>::type;
using ItemsLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ValueIteratorT>::type;
using KeysLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, KeyT*>::type;
using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, ValueT*>::type;
using KeysLoadPingIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeyIteratorT>::type;
using ItemsLoadPingIt =
typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ValueIteratorT>::type;
using KeysLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, KeyT*>::type;
using ItemsLoadPongIt = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<Policy, ValueT*>::type;

using KeysOutputPongIt = KeyIteratorT;
using ItemsOutputPongIt = ValueIteratorT;
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_sub_warp_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,8 @@ public:

using WarpMergeSortT = WarpMergeSort<KeyT, PolicyT::ITEMS_PER_THREAD, PolicyT::WARP_THREADS, ValueT>;

using KeysLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<PolicyT, const KeyT*>::type;
using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<PolicyT, const ValueT*>::type;
using KeysLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<PolicyT, const KeyT*>::type;
using ItemsLoadItT = typename THRUST_NS_QUALIFIER::cuda_cub::core::detail::LoadIterator<PolicyT, const ValueT*>::type;

using WarpLoadKeysT = cub::WarpLoad<KeyT, PolicyT::ITEMS_PER_THREAD, PolicyT::LOAD_ALGORITHM, PolicyT::WARP_THREADS>;
using WarpLoadItemsT =
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ __launch_bounds__(
CompareOp>::type;
using MergePolicy = typename MergeAgent::policy;

using THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator;
using THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator;
using vsmem_helper_t = vsmem_helper_impl<MergeAgent>;
__shared__ typename vsmem_helper_t::static_temp_storage_t shared_temp_storage;
auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage);
Expand Down
19 changes: 10 additions & 9 deletions cub/cub/device/dispatch/kernels/merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,13 @@

THRUST_NAMESPACE_BEGIN

namespace cuda_cub::core
namespace cuda_cub::core::detail
{
// We must forward declare here because make_load_iterator.h pulls in non NVRTC compilable code
template <class PtxPlan, class It>
typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE make_load_iterator(PtxPlan const&, It it);
} // namespace cuda_cub::core
typename detail::LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE
make_load_iterator(PtxPlan const&, It it);
} // namespace cuda_cub::core::detail

THRUST_NAMESPACE_END

Expand Down Expand Up @@ -196,8 +197,8 @@ __launch_bounds__(
AgentBlockSortT agent(
ping,
temp_storage,
THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_in),
THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_in),
THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_in),
THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_in),
keys_count,
keys_out,
items_out,
Expand Down Expand Up @@ -302,10 +303,10 @@ __launch_bounds__(
AgentMergeT agent(
ping,
temp_storage,
THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_ping),
THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_ping),
THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), keys_pong),
THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(ActivePolicyT(), items_pong),
THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_ping),
THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_ping),
THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), keys_pong),
THRUST_NS_QUALIFIER::cuda_cub::core::detail::make_load_iterator(ActivePolicyT(), items_pong),
keys_count,
keys_pong,
items_pong,
Expand Down
46 changes: 12 additions & 34 deletions thrust/thrust/system/cuda/detail/core/agent_launcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ namespace cuda_cub
{
namespace core
{

namespace detail
{
# ifndef THRUST_DETAIL_KERNEL_ATTRIBUTES
# define THRUST_DETAIL_KERNEL_ATTRIBUTES CCCL_DETAIL_KERNEL_ATTRIBUTES
# endif
Expand Down Expand Up @@ -97,7 +98,7 @@ THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*, Args... args)
template <class Agent>
struct AgentLauncher : Agent
{
core::AgentPlan plan;
AgentPlan plan;
size_t count;
cudaStream_t stream;
char const* name;
Expand All @@ -121,7 +122,7 @@ struct AgentLauncher : Agent
, name(name_)
, grid(static_cast<unsigned int>((count + plan.items_per_tile - 1) / plan.items_per_tile))
, vshmem(nullptr)
, has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
, has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
, shmem_size(has_shmem ? plan.shared_memory_size : 0)
{
assert(count > 0);
Expand All @@ -136,7 +137,7 @@ struct AgentLauncher : Agent
, name(name_)
, grid(static_cast<unsigned int>((count + plan.items_per_tile - 1) / plan.items_per_tile))
, vshmem(vshmem)
, has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
, has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
, shmem_size(has_shmem ? plan.shared_memory_size : 0)
{
assert(count > 0);
Expand All @@ -149,7 +150,7 @@ struct AgentLauncher : Agent
, name(name_)
, grid(plan.grid_size)
, vshmem(nullptr)
, has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
, has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
, shmem_size(has_shmem ? plan.shared_memory_size : 0)
{
assert(plan.grid_size > 0);
Expand All @@ -162,43 +163,19 @@ struct AgentLauncher : Agent
, name(name_)
, grid(plan.grid_size)
, vshmem(vshmem)
, has_shmem((size_t) core::get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
, has_shmem((size_t) get_max_shared_memory_per_block() >= (size_t) plan.shared_memory_size)
, shmem_size(has_shmem ? plan.shared_memory_size : 0)
{
assert(plan.grid_size > 0);
}

# if 0
THRUST_RUNTIME_FUNCTION
AgentPlan static get_plan(cudaStream_t s, void* d_ptr = 0)
{
// in separable compilation mode, we have no choice
// but to call kernel to get agent_plan
// otherwise the risk is something may fail
// if user mix & match ptx versions in a separably compiled function
// http://nvbugs/1772071
// XXX may be it is too string of a requirements, consider relaxing it in
// the future
# ifdef __CUDACC_RDC__
return core::get_agent_plan<Agent>(s, d_ptr);
# else
return get_agent_plan<Agent>(core::get_ptx_version());
# endif
}
THRUST_RUNTIME_FUNCTION
AgentPlan static get_plan_default()
{
return get_agent_plan<Agent>(sm_arch<0>::type::ver);
}
# endif

THRUST_RUNTIME_FUNCTION typename core::get_plan<Agent>::type static get_plan(cudaStream_t, void* d_ptr = 0)
THRUST_RUNTIME_FUNCTION typename get_plan<Agent>::type static get_plan(cudaStream_t, void* d_ptr = 0)
{
THRUST_UNUSED_VAR(d_ptr);
return get_agent_plan<Agent>(core::get_ptx_version());
return get_agent_plan<Agent>(get_ptx_version());
}

THRUST_RUNTIME_FUNCTION typename core::get_plan<Agent>::type static get_plan()
THRUST_RUNTIME_FUNCTION typename detail::get_plan<Agent>::type static get_plan()
{
return get_agent_plan<Agent>(lowest_supported_sm_arch::ver);
}
Expand Down Expand Up @@ -227,7 +204,7 @@ struct AgentLauncher : Agent
{
# if THRUST_DEBUG_SYNC_FLAG
cuda_optional<int> occ = max_sm_occupancy(k);
const int ptx_version = core::get_ptx_version();
const int ptx_version = get_ptx_version();
if (count > 0)
{
_CubLog(
Expand Down Expand Up @@ -305,6 +282,7 @@ struct AgentLauncher : Agent
}
};

} // namespace detail
} // namespace core
} // namespace cuda_cub

Expand Down
4 changes: 2 additions & 2 deletions thrust/thrust/system/cuda/detail/core/load_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

THRUST_NAMESPACE_BEGIN

namespace cuda_cub::core
namespace cuda_cub::core::detail
{

// LoadIterator
Expand All @@ -52,6 +52,6 @@ struct LoadIterator
cub::CacheModifiedInputIterator<PtxPlan::LOAD_MODIFIER, value_type, size_type>,
It>;
}; // struct Iterator
} // namespace cuda_cub::core
} // namespace cuda_cub::core::detail

THRUST_NAMESPACE_END
4 changes: 2 additions & 2 deletions thrust/thrust/system/cuda/detail/core/make_load_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

THRUST_NAMESPACE_BEGIN

namespace cuda_cub::core
namespace cuda_cub::core::detail
{
template <class PtxPlan, class It>
typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE
Expand All @@ -55,6 +55,6 @@ typename LoadIterator<PtxPlan, It>::type _CCCL_DEVICE _CCCL_FORCEINLINE make_loa
return make_load_iterator_impl<PtxPlan>(it, typename is_contiguous_iterator<It>::type());
}

} // namespace cuda_cub::core
} // namespace cuda_cub::core::detail

THRUST_NAMESPACE_END
51 changes: 4 additions & 47 deletions thrust/thrust/system/cuda/detail/core/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ namespace core
# endif
#endif

namespace detail
{
/// Typelist - a container of types
template <typename...>
struct typelist;
Expand Down Expand Up @@ -458,22 +460,9 @@ THRUST_RUNTIME_FUNCTION inline size_t get_max_shared_memory_per_block()
return static_cast<size_t>(i32value);
}

THRUST_RUNTIME_FUNCTION inline size_t virtual_shmem_size(size_t shmem_per_block)
{
size_t max_shmem_per_block = core::get_max_shared_memory_per_block();
if (shmem_per_block > max_shmem_per_block)
{
return shmem_per_block;
}
else
{
return 0;
}
}

THRUST_RUNTIME_FUNCTION inline size_t vshmem_size(size_t shmem_per_block, size_t num_blocks)
{
size_t max_shmem_per_block = core::get_max_shared_memory_per_block();
size_t max_shmem_per_block = get_max_shared_memory_per_block();
if (shmem_per_block > max_shmem_per_block)
{
return shmem_per_block * num_blocks;
Expand Down Expand Up @@ -509,22 +498,6 @@ struct BlockLoad
get_arch<PtxPlan>::type::ver>;
};

// BlockStore
// -----------
// a helper metaprogram that returns type of a block loader
template <class PtxPlan, class It, class T = typename iterator_traits<It>::value_type>
struct BlockStore
{
using type =
cub::BlockStore<T,
PtxPlan::BLOCK_THREADS,
PtxPlan::ITEMS_PER_THREAD,
PtxPlan::STORE_ALGORITHM,
1,
1,
get_arch<PtxPlan>::type::ver>;
};

// cuda_optional
// --------------
// used for function that return cudaError_t along with the result
Expand Down Expand Up @@ -619,16 +592,6 @@ THRUST_RUNTIME_FUNCTION inline int get_ptx_version()
return ptx_version;
}

THRUST_RUNTIME_FUNCTION inline cudaError_t sync_stream(cudaStream_t stream)
{
return cub::SyncStream(stream);
}

inline void _CCCL_DEVICE sync_threadblock()
{
__syncthreads();
}

// Deprecated [Since 2.8]
#define CUDA_CUB_RET_IF_FAIL(e) \
{ \
Expand Down Expand Up @@ -719,11 +682,6 @@ struct uninitialized_array
}
};

_CCCL_HOST_DEVICE _CCCL_FORCEINLINE size_t align_to(size_t n, size_t align)
{
return ((n + align - 1) / align) * align;
}

namespace host
{
inline cuda_optional<size_t> get_max_shared_memory_per_block()
Expand Down Expand Up @@ -753,9 +711,8 @@ THRUST_RUNTIME_FUNCTION cudaError_t alias_storage(
return cub::AliasTemporaries(storage_ptr, storage_size, allocations, allocation_sizes);
}

} // namespace detail
} // namespace core
using core::sm52;
using core::sm60;
} // namespace cuda_cub

THRUST_NAMESPACE_END
Loading

0 comments on commit a1a73a8

Please sign in to comment.