diff --git a/fbgemm_gpu/codegen/genscript/jinja_environment.py b/fbgemm_gpu/codegen/genscript/jinja_environment.py
index d682208989..51cf32cbd6 100644
--- a/fbgemm_gpu/codegen/genscript/jinja_environment.py
+++ b/fbgemm_gpu/codegen/genscript/jinja_environment.py
@@ -295,13 +295,7 @@ def has_experimental_support(
     Check if the given combination of configs has TBE v2 support
     - TBE v2 does not support dense, nobag, vbe, is_index_select, is_rocm, and ssd
     """
-    return (
-        not dense
-        and not nobag
-        and not vbe
-        and not is_index_select
-        and not ssd
-    )
+    return not dense and not nobag and not vbe and not is_index_select and not ssd
 
 
 def is_valid_gwd_config(
diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh b/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh
index 9612e13198..ec21076b3e 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh
+++ b/fbgemm_gpu/include/fbgemm_gpu/utils/cuda_prelude.cuh
@@ -127,8 +127,9 @@ DEVICE_INLINE uint32_t ballot_sync(
 
 /// Sums a register value across all warp threads
 template <typename T, int ReduceWidth = kWarpSize>
-DEVICE_INLINE T
-warpReduceAllSum(T val, unsigned shfl_sync_mask = static_cast<unsigned>(kFullWarpMask)) {
+DEVICE_INLINE T warpReduceAllSum(
+    T val,
+    unsigned shfl_sync_mask = static_cast<unsigned>(kFullWarpMask)) {
 #pragma unroll
   for (int mask = ReduceWidth / 2; mask > 0; mask >>= 1) {
     val += shfl_xor(val, mask, ReduceWidth, shfl_sync_mask);