Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport to branch/2.3.x] Rework our system header approach to be more error proof (#661) #675

Merged
merged 1 commit into from
Nov 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
12 changes: 7 additions & 5 deletions cub/cub/agent/agent_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@

#include <cub/config.cuh>

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cub/block/block_adjacent_difference.cuh>
#include <cub/block/block_load.cuh>
Expand Down
12 changes: 7 additions & 5 deletions cub/cub/agent/agent_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@

#include <cub/config.cuh>

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cub/agent/single_pass_scan_operators.cuh>
#include <cub/block/block_exchange.cuh>
Expand Down
46 changes: 24 additions & 22 deletions cub/cub/agent/agent_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@

#include <cub/config.cuh>

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cub/block/block_load.cuh>
#include <cub/grid/grid_queue.cuh>
Expand Down Expand Up @@ -266,18 +268,18 @@ struct AgentHistogram
struct _TempStorage
{
// Smem needed for block-privatized smem histogram (with 1 word of padding)
CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1];
CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1];

int tile_idx;

// Aliasable storage layout
union Aliasable
{
// Smem needed for loading a tile of samples
typename BlockLoadSampleT::TempStorage sample_load;
typename BlockLoadSampleT::TempStorage sample_load;

// Smem needed for loading a tile of pixels
typename BlockLoadPixelT::TempStorage pixel_load;
typename BlockLoadPixelT::TempStorage pixel_load;

// Smem needed for loading a tile of vecs
typename BlockLoadVecT::TempStorage vec_load;
Expand Down Expand Up @@ -650,7 +652,7 @@ struct AgentHistogram

/**
* @brief Consume a tile of data samples
*
*
* @tparam IS_ALIGNED
* Whether the tile offset is aligned (vec-aligned for single-channel, pixel-aligned for multi-channel)
*
Expand Down Expand Up @@ -691,17 +693,17 @@ struct AgentHistogram

/**
* @brief Consume row tiles. Specialized for work-stealing from queue
*
* @param num_row_pixels
* The number of multi-channel pixels per row in the region of interest
*
* @param num_rows
* @param num_row_pixels
* The number of multi-channel pixels per row in the region of interest
*
* @param num_rows
* The number of rows in the region of interest
*
* @param row_stride_samples
* @param row_stride_samples
* The number of samples between starts of consecutive rows in the region of interest
*
* @param tiles_per_row
* @param tiles_per_row
* Number of image tiles per row
*/
template <bool IS_ALIGNED>
Expand Down Expand Up @@ -752,17 +754,17 @@ struct AgentHistogram

/**
* @brief Consume row tiles. Specialized for even-share (striped across thread blocks)
*
* @param num_row_pixels
*
* @param num_row_pixels
* The number of multi-channel pixels per row in the region of interest
*
* @param num_rows
* @param num_rows
* The number of rows in the region of interest
*
* @param row_stride_samples
* @param row_stride_samples
* The number of samples between starts of consecutive rows in the region of interest
*
* @param tiles_per_row
* @param tiles_per_row
* Number of image tiles per row
*/
template <bool IS_ALIGNED>
Expand Down Expand Up @@ -829,10 +831,10 @@ struct AgentHistogram
/**
* @brief Constructor
*
* @param temp_storage
* @param temp_storage
* Reference to temp_storage
*
* @param d_samples
* @param d_samples
* Input data to reduce
*
* @param num_output_bins
Expand Down
12 changes: 7 additions & 5 deletions cub/cub/agent/agent_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@

#include <cub/config.cuh>

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cub/block/block_load.cuh>
#include <cub/block/block_merge_sort.cuh>
Expand Down
14 changes: 8 additions & 6 deletions cub/cub/agent/agent_radix_sort_downsweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,13 @@

#include <cub/config.cuh>

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cub/block/block_exchange.cuh>
#include <cub/block/block_load.cuh>
Expand Down Expand Up @@ -131,7 +133,7 @@ struct AgentRadixSortDownsweepPolicy : ScalingType


/**
* @brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in
* @brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in
* device-wide radix sort downsweep .
*
* @tparam AgentRadixSortDownsweepPolicy
Expand Down
12 changes: 7 additions & 5 deletions cub/cub/agent/agent_radix_sort_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,13 @@

#include <cub/config.cuh>

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cub/block/block_load.cuh>
#include <cub/block/radix_rank_sort_operations.cuh>
Expand Down
12 changes: 7 additions & 5 deletions cub/cub/agent/agent_radix_sort_onesweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@

#include <cub/config.cuh>

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cub/block/block_radix_rank.cuh>
#include <cub/block/block_store.cuh>
Expand Down
14 changes: 8 additions & 6 deletions cub/cub/agent/agent_radix_sort_upsweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@

#include <cub/config.cuh>

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cub/block/block_load.cuh>
#include <cub/block/radix_rank_sort_operations.cuh>
Expand Down Expand Up @@ -510,7 +512,7 @@ struct AgentRadixSortUpsweep
* @brief Extract counts
*
* @param[out] bin_count
* The exclusive prefix sum for the digits
* The exclusive prefix sum for the digits
* [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
*/
template <int BINS_TRACKED_PER_THREAD>
Expand Down
12 changes: 7 additions & 5 deletions cub/cub/agent/agent_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@

#include <cub/config.cuh>

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <iterator>

Expand Down
12 changes: 7 additions & 5 deletions cub/cub/agent/agent_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@

#include <cub/config.cuh>

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cub/agent/single_pass_scan_operators.cuh>
#include <cub/block/block_discontinuity.cuh>
Expand Down
42 changes: 22 additions & 20 deletions cub/cub/agent/agent_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@

#include <cub/config.cuh>

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cub/agent/single_pass_scan_operators.cuh>
#include <cub/block/block_discontinuity.cuh>
Expand Down Expand Up @@ -130,7 +132,7 @@ struct AgentRlePolicy
******************************************************************************/

/**
* @brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode
* @brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode
*
* @tparam AgentRlePolicyT
* Parameterized AgentRlePolicyT tuning policy type
Expand Down Expand Up @@ -329,22 +331,22 @@ struct AgentRle
//---------------------------------------------------------------------

/**
* @param[in] temp_storage
* @param[in] temp_storage
* Reference to temp_storage
*
* @param[in] d_in
* @param[in] d_in
* Pointer to input sequence of data items
*
* @param[out] d_offsets_out
* @param[out] d_offsets_out
* Pointer to output sequence of run offsets
*
* @param[out] d_lengths_out
* @param[out] d_lengths_out
* Pointer to output sequence of run lengths
*
* @param[in] equality_op
* @param[in] equality_op
* Equality operator
*
* @param[in] num_items
* @param[in] num_items
* Total number of input items
*/
__device__ __forceinline__ AgentRle(TempStorage &temp_storage,
Expand Down Expand Up @@ -731,19 +733,19 @@ struct AgentRle
/**
* @brief Process a tile of input (dynamic chained scan)
*
* @param num_items
* @param num_items
* Total number of global input items
*
* @param num_remaining
* @param num_remaining
* Number of global input items remaining (including this tile)
*
* @param tile_idx
* @param tile_idx
* Tile index
*
* @param tile_offset
* @param tile_offset
* Tile offset
*
* @param &tile_status
* @param &tile_status
* Global list of tile status
*/
template <bool LAST_TILE>
Expand Down Expand Up @@ -953,13 +955,13 @@ struct AgentRle
/**
* @brief Scan tiles of items as part of a dynamic chained scan
*
* @param num_tiles
* @param num_tiles
* Total number of input tiles
*
* @param tile_status
* @param tile_status
* Global list of tile status
*
* @param d_num_runs_out
* @param d_num_runs_out
* Output pointer for total number of runs identified
*
* @tparam NumRunsIteratorT
Expand Down
Loading