diff --git a/.zenodo.json b/.zenodo.json index f7a2af2a8684..e2988c35a4bd 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -7,6 +7,11 @@ "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf", "orcid": "0000-0003-3396-6154" }, + { + "name": "Bastrakova, Kseniia", + "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf", + "orcid": "0000-0001-8970-5098" + }, { "name": "Bocci, Andrea", "affiliation": "CERN", @@ -22,15 +27,18 @@ "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf", "orcid": "0000-0002-8218-3116" }, + { + "name": "Ferragina, Luca", + "affiliation": "CERN" + }, { "name": "Gruber, Bernhard Manfred", "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf, CERN", "orcid": "0000-0001-7848-1690" }, { - "name": "Huebl, Axel", - "affiliation": "Lawrence Berkeley National Laboratory", - "orcid": "0000-0003-1943-7141" + "name": "Kaever, Christian", + "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf" }, { "name": "Kelling, Jeffrey", @@ -38,28 +46,29 @@ "orcid": "0000-0003-1761-2591" }, { - "name": "Pantaleo, Felice", + "name": "Martin-Haugh, Stewart", + "affiliation": "STFC Rutherford Appleton Laboratory", + "orcid": "0000-0001-9457-1928" + }, + { + "name": "Perego, Aurora", "affiliation": "CERN", - "orcid": "0000-0003-3266-4357" + "orcid": "0000-0003-1576-6757" }, { "name": "Stephan, Jan", "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf", "orcid": "0000-0001-7839-4386" }, - { - "name": "Vyskočil, Jiří", - "affiliation":"CASUS, Helmholtz-Zentrum Dresden-Rossendorf", - "orcid": "0000-0001-8822-0929" - }, { "name": "Widera, René", "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf", "orcid": "0000-0003-1642-0459" }, { - "name": "Worpitz, Benjamin", - "affiliation": "LogMeIn Inc." 
+ "name": "Young, Jeffrey", + "affiliation": "Georgia Institute of Technology", + "orcid": "0000-0001-9841-4057" } ], "contributors": [ @@ -73,6 +82,12 @@ "affiliation": "TU Dresden", "type": "Other" }, + { + "name": "Hübl, Axel", + "affiliation": "Lawrence Berkeley National Laboratory", + "orcid": "0000-0003-1943-7141", + "type": "Other" + }, { "name": "Knespel, Maximilian", "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf", @@ -99,6 +114,12 @@ "affiliation": "JetBrains", "type": "Other" }, + { + "name": "Pantaleo, Felice", + "affiliation": "CERN", + "orcid": "0000-0003-3266-4357", + "type": "Other" + }, { "name": "Rogers, David M.", "affiliation": "Oak Ridge National Laboratory", @@ -120,6 +141,12 @@ "affiliation": "Deutsches Zentrum für Luft- und Raumfahrt e.V.", "type": "Other" }, + { + "name": "Vyskočil, Jiří", + "affiliation": "CASUS, Helmholtz-Zentrum Dresden-Rossendorf", + "orcid": "0000-0001-8822-0929", + "type": "Other" + }, { "name": "Werner, Matthias", "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf", @@ -130,6 +157,11 @@ "affiliation":"TU Dresden", "type": "Other" }, + { + "name": "Worpitz, Benjamin", + "affiliation": "LogMeIn Inc.", + "type": "Other" + }, { "name": "Zacharias, Malte", "affiliation": "Helmholtz-Zentrum Dresden-Rossendorf", diff --git a/CHANGELOG.md b/CHANGELOG.md index b4907b99e1ff..0d4ee9864d91 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,284 @@ # Changelog All notable changes to this project will be documented in this file. -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). 
+ +## [1.0.0] - 2023-11-14 + +### Added + +- `g++`: + - Added support for `g++-13` #1967 + - Added support for `g++-12` #1721 #1754 #1765 #1867 +- `clang++`: + - Added support for `clang-17` #2171 #2174 + - Added support for `clang-16` #1971 #2006 + - Added support for `clang-15` #1898 + - Added support for `clang-14` #1766 + - Added support for `clang-13` #1756 +- `icpx`: + - Added support for the Intel® oneAPI DPC++/C++ Compiler (`icpx`) #1700 #1706 #1884 #2064 #2081 +- Xcode: + - Added support for Xcode 14.3.1 #1973 + - Added support for Xcode 14.2 #1899 +- CUDA: + - Added support for CUDA 12.2 #2043 + - Added support for CUDA 12.1 #1957 + - Added support for CUDA 11.{6,7,8} and 12.0 #1890 +- ROCm: + - Added support for ROCm 5.5 #1961 + - Added support for ROCm 5.4 #1915 + - Added support for ROCm 5.3 #1838 + - Added support for ROCm 5.2.3 #1812 +- `alpaka::math`: + - Added `alpaka::math::copysign` function #2050 + - Added `alpaka::math::log2` and `alpaka::math::log10` functions #2029 + - Added `alpaka::math::fma` functions #2015 + - Added hyperbolic functions #1828 #2030 + - Added `constants` namespace which contains constants such as π, e, etc. 
#1710 +- `alpaka::Vec`: + - Added generator constructor #2085 + - Added `front` and `back` methods #2085 + - Added `elementwise_{min,max}` methods #1805 + - `Vec` now features a deduction guide for easier construction #1610 +- Documentation: + - Added example illustrating typical data-parallel patterns with alpaka #1712 + - Added documentation about the behaviour of `constexpr` functions in kernel code #1699 + - Added documentation about CUDA function attributes #1697 + - Added documentation about setting the C++ standard library for clang #1695 +- Test cases: + - Added test for `alpaka::ViewSubView` #2095 + - Added queue test which checks that a task is destroyed after execution #2047 + - Added test for `alpaka::getValidWorkDiv` with `Idx` type #1830 + - Added tests for `alpaka::subDivideGridElements` #1829 +- CI: + - Run test cases with `-Werror` #2163 + - Added UBSan CI job #2059 + - Added CI job to create amalgamated `alpaka.hpp` #1956 #1965 #1972 + - Made GitLab CI jobs interruptible #1904 + - Updated used Boost and CMake versions #1903 #1969 + - Added `agc-manager` support #1871 #1921 + - Added TSan CI job #1851 #2103 #2137 + - GitLab CI jobs are now automatically generated #1785 #1889 #1896 #1951 #1952 #2005 #2041 +- Upgraded to `clang-format-16` #2147 +- Added `alpaka::getPitchesInBytes` function which returns all pitches for a given view as an `alpaka::Vec` #2092 #2093 #2116 #2125 +- Added `alpaka::get{Extents,Offsets}` functions which return all extents/offsets for a given view as an `alpaka::Vec` #2080 +- Added `alpaka_DISABLE_VENDOR_RNG` CMake flag and its corresponding preprocessor macro `ALPAKA_DISABLE_VENDOR_RNG` to optionally disable vendor RNG libraries #2036 +- Added alpaka port of BabelStream #1846 #1934 +- Added utility functions `alpaka::core::{divCeil,intPow,nthRootFloor}` #1830 +- Added `operator==` for `alpaka::WorkDivMembers` #1829 +- Added `alpaka::is{Accelerator,Device,Platform,Queue}` variable templates #1818 +- Added accelerator tags 
which allow for accelerator-specific code paths without enabling the corresponding back-end #1804 #1814 +- Added experimental support for `std::mdspan` #1788 #2048 #2052 #2053 +- Added `alpaka::ViewConst` which wraps another view but prevents modifying accesses #1746 +- `alpaka::{memcpy,memset}` now support temporary destination views #1743 +- Host memory alignment can now be specified by using the `ALPAKA_DEFAULT_HOST_MEMORY_ALIGNMENT` macro #1686 +- Added `alpaka::allocMappedBuf` for allocating device-accessible pinned host memory #1685 #1782 #2162 + - Added related trait `alpaka::trait::hasMappedBufSupport` to query the host CPU for device-accessible pinned memory support #1782 + - Added related utility function `alpaka::allocMappedBufIfSupported` to allocate device-accessible pinned memory, if supported, and regular memory otherwise #1782 #2120 +- Relocatable device code can now be enabled using the `alpaka_RELOCATABLE_DEVICE_CODE` CMake option #1467 + +### Changed + +- API changes: + - **Breaking change**: `alpaka::get{Width,Height,Depth}` now always return `1` for unavailable dimensions instead of `static_assert`ing #2148 + - **Breaking change**: alpaka platforms have been renamed from `alpaka::Pltf*` to `alpaka::Platform*` #2024 #2032 + - **Breaking change**: alpaka platforms are now full objects instead of types #1988 #2051 #2165 + - `operator<<(std::ostream&, WorkDivMembers const&)` is now a `friend` of `alpaka::WorkDivMembers` instead of a method #1829 + - **Potentially breaking change**: Switched several view-related methods from `ALPAKA_FN_HOST_ACC` to `ALPAKA_FN_HOST` #1826 + - Accelerators' copy/move constructors and assignment operators are now explicitly `delete`d #1825 + - `alpaka::test::allocAsyncBufIfSupported` was moved into the general `namespace alpaka` #1782 + - Removed unnecessary attribute `ALPAKA_FN_HOST_ACC` from defaulted functions #1761 + - The `UniformCudaHip` types are now templated on traits-like `struct`s which encapsulate the CUDA 
or HIP API #1665 +- General behavioural changes: + - Improved handling of CMake generator expressions #2146 + - Improved detection of C++20 features #2138 + - Simplified internals of `alpaka_add_{executable,library}` #2072 #2082 + - **Breaking change**: Removed dummy atomics from memory fence implementations. Users now need to guarantee correctness themselves #2071 + - In debug mode MSVC will use the `/Od` optimization level #1977 + - In debug mode clang-based compilers will explicitly use the `-O0` optimization level #1977 + - In debug mode `g++` will use the `-Og` optimization level #1977 + - `-Werror` and its MSVC equivalent `/WX` are no longer enabled by default when `BUILD_TESTING` is set to `ON` #1977 + - A platform's internal `std::vector` containing the `alpaka::Device`s now reserves the necessary memory before initialization #1926 + - **Potentially breaking change**: `ALPAKA_FN_INLINE` now enforces inlining for platforms other than CUDA and HIP #1918 + - Replaced `alpaka::core::ConcurrentExecPool` with `alpaka::core::CallbackThread` in all queue implementations #1870 + - If no back-end is enabled, alpaka automatically selects the serial back-end for examples and test cases #1843 + - On Linux platforms, the free global memory is now determined by a call to `sysconf(_SC_AVPHYS_PAGES)` instead of querying `/proc/sysinfo` #1776 + - **Potentially breaking change**: Changed CMake's look-up of MSVC's runtime libraries (see [here](https://cmake.org/cmake/help/v3.22/policy/CMP0091.html) for an in-depth explanation) #1751 + - Unified `alpaka::{memcpy,memset}`'s internal `static_assert`s #1748 + - `alpaka::core::aligned{Alloc,Free}` now internally use aligned `new`/`delete` instead of OS-specific APIs #1689 +- CUDA/HIP back-end changes: + - `nvcc` now makes correct use of `--Werror` and more CUDA-related warnings #2135 + - Unified `ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK` macros #2090 + - Made some internal constants `constexpr` #2063 + - The CUDA/HIP back-ends will now 
always use `std::size_t` for internal pitch calculations #2056 + - **Breaking change**: clang as CUDA compiler will only work in `Release` build mode #2027 + - **Potentially breaking change**: In debug mode `nvcc` will now use the `-G` flag which enables device-side debug symbols #1977 + - Starting from HIP 5.2.0, the HIP back-end includes `` instead of `` #1914 + - Starting from HIP 5.2.0, the HIP back-end makes use of `hip{Malloc,Free}Async` #1894 + - If clang is used as CUDA compiler together with CUDA 11.3 a warning will be printed #1890 + - Starting from HIP 5.4.0, the HIP back-end internally uses `hipLaunchHostFunc` instead of a work-around #1883 + - Adapted to API changes in CUDA 11.7's stream memory operations #1878 #1919 + - Shortened mangled CUDA kernel names #1795 + - CUDA runtime versions checks are now based upon `CUDART_VERSION` instead of `BOOST_LANG_CUDA` #1777 + - Because of a HIP performance regression the HIP back-end now uses the emulated `atomicAdd(float)` on the `Threads` hierarchy level #1771 + - Changed look-up of built-in and emulated atomic functions for the CUDA and HIP back-ends #1768 + - The HIP back-end now uses the built-in `atomicAdd(double)` #1767 + - CUDA/HIP queues now internally make use of callback threads #1719 #1735 #1976 #2011 +- SYCL back-end changes: + - Removed unnecessary `-fintelfpga` flag from CMake build system when compiling the SYCL back-end for Intel FPGAs #2179 + - **Breaking change**: Support for the `activemask` intrinsic is disabled for the SYCL back-end #2161 + - Updated `README_SYCL.md` #2140 + - **Breaking change**: Reworked CMake handling for SYCL targets #1970 #2066 + - **Breaking change**: The SYCL back-end now accepts SYCL USM pointers as kernel parameters #1845 #2042 + - **Breaking change**: The SYCL CPU selector was generalized to both Intel and non-Intel CPUs and therefore renamed #1845 + - **Breaking change**: The SYCL back-end replaced `sycl::stream` with `printf` for device side printing #1845 #2045 
+ - The SYCL back-end now features a kernel trait which allows to set the SYCL sub-group (= warp) size #1845 + - The SYCL back-end now supports RNG through the Intel oneAPI libraries #1845 + - The SYCL back-end is now based upon the SYCL 2020 specification #1845 #1981 +- RNG changes: + - **Breaking change**: Philox RNG is now counter-based and stateless #1792 + - Philox random engines are now trivially copyable #1778 +- Documentation: + - Improved documentation of `ALPAKA_FN_INLINE` #2091 + - Reduced example work sizes #2084 + - Improved documentation of `alpaka::QueueCpuOmp2Collective` #2025 + - Clarified kernel and kernel argument requirements #1944 + - Replaced license headers with SPDX license identifiers #1917 + - Collapsed compiler support matrix in `README.md` #1860 +- Refactorings: + - Refactored test classes #2156 #2158 + - Use nested namespace specifiers #2152 + - Removed unnecessary member initialization calls #2151 + - Avoid unnecessary indentions #2149 + - Renamed internal variables of `ViewSubViewTest.cpp` and `ViewPlainPtrTest.cpp` to prevent name shadowing #2144 + - Refactored the internals of `alpaka::{mapIdx,mapIdxPitchBytes}` #2136 + - Replaced Codeplay's STLTuple implementation with `std::tuple` #2106 + - Replaced `ALPAKA_DECAY_T` macro with `std::decay_t` #2104 + - Refactored `alpaka::internal::ViewAccessOps` #2094 + - **Breaking change**: Replaced `alpaka::createVecFromIndexedFn` family of functions with `alpaka::Vec`'s new generator constructor #2085 + - Refactored `alpaka::QueueCpuOmp2Collective` #2013 + - Refactored `alpaka::meta::ndLoop` #1999 + - Refactored `alpaka::TaskKernelCpuThreads` #1998 + - Refactored `alpaka::core::ConcurrentExecPool` and related classes #1852 #2000 + - Refactored `alpaka::subDivideGridElements` #1830 + - Refactored includes inside `alpaka/dev/cpu/SysInfo.hpp` #1776 +- Test changes: + - Catch2 is no longer built with fast math enabled when using `icpx` as compiler #2128 + - `-pedantic` is no longer added when 
compiling CUDA code #2096 + - Reduced noise from `helloWorld`, `helloWorldLambda` and `TestTemplate` #2076 + - Renamed `fenceTest` to `FenceTest` #2037 + - The `Any` intrinsic unit test now assumes a sub-group size of `4` #2017 + - The `NativeHandleTest` no longer assumes that a native handle is an `int` #2008 + - Test cases are now compiled with MSVC's two phase lookup enabled #1986 + - Kernel names in the test cases are now demangled #1983 + - CUDA/HIP/SYCL atomic tests are now restricted to explicitly supported types #1980 + - Test cases are no longer executed for zero-dimensional SYCL accelerators #1979 + - Tests are disabled by default when using alpaka via CMake's `add_subdirectory` #1912 +- CI changes: + - Removed unused sanitizer blacklists #2154 + - Simplified CI oneTBB installation #2145 + - The GitLab CI now features runtime tests built with `g++` and `clang++` #2131 #2141 + - Upgraded ASan CI job to `clang-16` #2057 + - Upgraded special CUDA jobs to newer versions #2055 + - Re-enabled `g++-9` + CUDA jobs #2040 + - Updated Read the Docs configuration to v2 #2010 + - For ROCm versions <= 5.3 certain warnings are ignored #1932 + - Split compile and runtime CI runners into separate GitLab pipelines #1908 + - Switched more CI runners to C++20 mode #1902 + - LLVM sanitizer libraries are explicitly installed #1900 + - Re-enabled CUDA + `gcc-10` jobs #1890 + - Moved all GitHub jobs from `ubuntu-latest` to `ubuntu-20.04` #1872 + - More jobs are only compiling the test cases but no longer execute them #1869 + - CUDA CI runners no longer manually install the GPU driver #1853 + - Change ROCm CI node #1844 + - Reworked Xcode OpenMP installation #1840 #1922 + - Upgraded to GitHub checkout action v3 #1832 + - Upgraded test infrastructure to Catch2 v3 #1749 #1815 #1861 #1911 + - Upgraded headercheck CI run to clang-13 and CUDA 11.2 #1803 + - Simplified CI clang installation #1763 + - Running CI workflows are now automatically cancelled when their corresponding PRs are 
updated #1717 + +### Deprecated + +- **Breaking change**: deprecated `alpaka::getPitchBytes[Vec]` functions in favour of new `alpaka::getPitchesInBytes` function #2092 #2116 +- **Breaking change**: deprecated `alpaka::get{Extent,Offset}[Vec]` functions in favour of new `alpaka::get{Extents,Offsets}` functions #2080 #2139 + +### Removed + +- `g++`: + - Dropped support for `g++-{7,8}` #1872 +- `clang++`: + - Removed work-around for very old clang versions #1916 + - Dropped support for clang as CUDA compiler for all versions before `clang-14` #1890 + - Dropped support for `clang-{6,7,8,9}` #1872 + - Dropped support for `clang-5` #1750 +- `icpc`: + - Dropped support for the Intel® C++ Compiler Classic (`icpc`) #1702 +- MSVC: + - Temporarily dropped support for MSVC + CUDA due to a nvcc bug #1958 + - Dropped support for MSVC 2019 #1887 +- Xcode: + - Dropped support for Xcode 12.4.0 #1759 +- CUDA: + - Dropped support for CUDA 10 #1872 + - Dropped support for CUDA 9.2 #1855 +- ROCm: + - Dropped support for ROCm 4 #1886 +- SYCL: + - Removed Xilinx platform support #1970 +- Removed floating point contractions for math test cases #2155 +- Removed `alpaka::set{Extent,Offset}` functions #2087 +- Removed alpaka's experimental accessors #2054 #2062 +- Catch2 is no longer compiled with `CATCH_CONFIG_FAST_COMPILE` set to `ON` #1978 +- Removed OpenMP 5 back-end #1947 +- Removed OpenACC back-end #1941 +- Removed warning for Boost 1.73 since alpaka requires Boost >= 1.74 #1849 +- Removed previously deprecated `alpaka::time` functionality #1841 +- Removed `alpaka::{map,unmap,pin,unpin,isPinned,prepareForAsyncCopy}()` free functions #1790 +- Removed unused `alpaka::ConceptUniformCudaHip` #1736 +- Removed Boost.fiber back-end #1718 + +### Fixed + +- Fixed warnings uncovered by `nvcc` + `clang++ -Werror` #2157 #2159 #2164 #2167 +- Removed useless semicolon #2129 +- Fixed debug information for SYCL zero-dimensional buffer allocations #2127 +- Fixed missing `[[maybe_unused]]` inside 
`extent/Traits.hpp` #2122 +- Fixed several minor issues with the documentation #2121 #2176 +- Fixed unsigned integer conversion inside `ViewAccessOps.hpp` #2119 +- Fixed several warnings issued by `nvcc` #2118 +- Fixed compiler explorer link #2117 +- `alpaka::core::detail::ThreadPool` now handles a task's `noexcept` specifier correctly #2115 +- Fixed missing `` include in `BlockSyncBarrierOmp.hpp` #2114 +- Fixed integer conversions inside `memViewTest` #2113 +- Fixed `alpaka::BufUniformCudaHipRt` declarations sometimes being a `struct` and sometimes a `class` #2109 +- Fixed `alpaka::wait()` behaviour for events and devices #2108 +- Fixed `alpaka::ViewPlainPtr` not being copyable and moveable #2105 +- **Potentially breaking change**: Fixed `alpaka::core::{CallbackThread,ThreadPool}` not propagating exceptions #2067 +- Fixed missing `ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK` calls in debug mode #2034 +- Worked around Catch2 macros not being thread-safe #2022 +- Fixed `alpaka::test::KernelExecutionFixture`'s delegating constructor #2021 +- Fixed missing `` include in `alpaka/rand/Traits.hpp` #1977 +- Fixed ill-formed spelling of `alpaka::EventUniformCudaHipRt`'s constructor in C++20 mode #1968 +- Fixed typo in memory fence documentation #1944 +- Fixed compilation issues for CPU-only jobs running on GPU CI runners #1939 +- Fixed clang-specific warning suppression occurring for other compilers in HIP back-end #1914 +- Fixed CI clang installation #1907 +- Fixed CUDA async / mapped memory allocation bug #1868 +- Fixed several bugs related to thread safety #1850 #1975 #1987 #1989 #2026 #2057 +- Fixed `alpaka::createView` for containers without a size argument #1847 +- Fixed behaviour of `alpaka::detail::nextDivisorLowerOrEqual` #1829 +- Fixed missing `final` keyword for accelerator inheritance #1816 +- Fixed missing template parameters in `alpaka::allocBuf(host, extent)` #1777 +- Fixed look-up of `atomic*_block()` functions for the CUDA back-end when clang is the device compiler 
#1773 +- Fixed mixed-type and mixed-precision `alpaka::math::pow` implementation #1733 +- Fixed `alpaka::QueueGenericThreadsNonBlocking` not completing running tasks upon its destruction #1728 +- Fixed host memory allocation / pinning on OpenPOWER platforms #1725 +- Fixed `alpaka::ffs` CPU intrinsic in C++20 mode #1716 +- Fixed typo in cheatsheet example for `alpaka::getWorkDiv` #1711 +- Fixed missing braces around aggregate initializers #1704 +- Fixed CI installation of CUDA apt repository keys #1703 ## [0.9.0] - 2022-04-21 ### Compatibility Changes: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 89975c03820b..bae65b41b724 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,19 +2,19 @@ ## Formatting -Please format your code before before opening pull requests using clang-format 14 and the .clang-format file placed in the repository root. +Please format your code before opening pull requests using clang-format 16 and the .clang-format file placed in the repository root. ### Visual Studio and CLion Suport for clang-format is built-in since Visual Studio 2017 15.7 and CLion 2019.1. The .clang-format file in the repository will be automatically detected and formatting is done as you type, or triggered when pressing the format hotkey. ### Bash -First install clang-format-14. Instructions therefore can be found on the web. +First install clang-format-16. Instructions therefore can be found on the web. 
To format your changes since branching off develop, you can run this command in bash: ``` -git clang-format-14 develop +git clang-format-16 develop ``` To format all code in your working copy, you can run this command in bash: ``` -find -iname '*.cpp' -o -iname '*.hpp' | xargs clang-format-14 -i +find -iname '*.cpp' -o -iname '*.hpp' | xargs clang-format-16 -i ``` diff --git a/README.md b/README.md index 229a520ca33d..8fd6b25f94ec 100644 --- a/README.md +++ b/README.md @@ -65,17 +65,17 @@ Supported Compilers This library uses C++17 (or newer when available). -| Accelerator Back-end | gcc 9.5
(Linux) | gcc 10.4 / 11.1
(Linux) | gcc 12.3
(Linux) | gcc 13.1
(Linux) | clang 9
(Linux) | clang 10 / 11
(Linux) | clang 12
(Linux) | clang 13
(Linux) | clang 14
(Linux) | clang 15
(Linux) | clang 16
(Linux) | icpx 2023.1.0 / 2023.2.0 (Linux) | Xcode 13.2.1 / 14.2 / 14.3.1
(macOS) | Visual Studio 2022
(Windows) | -|--------------------------------------------------------------------------------|-------------------------------------------------|-------------------------------------------------|---------------------------------------------|------------------------|------------------------------------------------------------|-------------------------------------------------------|-------------------------------------------------|---------------------------------------------|---------------------------------------------------|-------------------------------------------|-------------------------------------------|----------------------------------|-------------------------------------------------------|--------------------------------------| -| Serial | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| OpenMP 2.0+ blocks | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark:[^3] | :white_check_mark: | :white_check_mark: | -| OpenMP 2.0+ threads | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark:[^3] | :white_check_mark: | :white_check_mark: | -| std::thread | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | 
:white_check_mark: | -| TBB | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| CUDA (nvcc) | :white_check_mark:
(CUDA 11.0 - 12.2)[^2] | :white_check_mark:
(CUDA 11.4 - 12.0)[^2] | :white_check_mark:
(CUDA 12.0 - 12.2) | :x: | :white_check_mark:
(CUDA 11.0-11.2; 11.6 - 12.0)[^2] | :white_check_mark:
(CUDA 11.2, 11.6 - 12.0)[^2] | :white_check_mark:
(CUDA 11.6 - 12.0)[^2] | :white_check_mark:
(CUDA 11.7 - 12.0) | :white_check_mark:
(CUDA 11.8 - 12.0) | :white_check_mark:
(CUDA 12.2) | :x: | :x: | :x: | :x: | -| CUDA (clang) | - | - | - | :x: | :x: | :x: | :x: | :x: | :white_check_mark: (CUDA 11.0 - 11.5) | :white_check_mark: (CUDA 11.0 - 11.5)[^1] | :white_check_mark: (CUDA 11.0 - 11.5)[^1] | :x: | - | - | -| [HIP](https://alpaka.readthedocs.io/en/latest/install/HIP.html) (clang) | - | - | - | :x: | :x: | :x: | :x: | :x: | :white_check_mark: (HIP 5.0 - 5.2) | :white_check_mark: (HIP 5.3 - 5.4) | :white_check_mark: (HIP 5.5) | :x: | - | - | -| SYCL | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :white_check_mark:[^4] | :x: | :x: | +| Accelerator Back-end | gcc 9.5
(Linux) | gcc 10.4 / 11.1
(Linux) | gcc 12.3
(Linux) | gcc 13.1
(Linux) | clang 9
(Linux) | clang 10 / 11
(Linux) | clang 12
(Linux) | clang 13
(Linux) | clang 14
(Linux) | clang 15
(Linux) | clang 16
(Linux) | clang 17
(Linux) | icpx 2023.1.0 / 2023.2.0 (Linux) | Xcode 13.2.1 / 14.2 / 14.3.1
(macOS) | Visual Studio 2022
(Windows) | +|--------------------------------------------------------------------------------|-------------------------------------------------|-------------------------------------------------|---------------------------------------------|------------------------|------------------------------------------------------------|-------------------------------------------------------|-------------------------------------------------|---------------------------------------------|---------------------------------------------------|-------------------------------------------|-------------------------------------------|-------------------------------------------|----------------------------------|-------------------------------------------------------|--------------------------------------| +| Serial | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| OpenMP 2.0+ blocks | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark:[^3] | :white_check_mark: | :white_check_mark: | +| OpenMP 2.0+ threads | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark:[^3] | :white_check_mark: | :white_check_mark: | +| std::thread | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | 
:white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| TBB | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| CUDA (nvcc) | :white_check_mark:
(CUDA 11.0 - 12.2)[^2] | :white_check_mark:
(CUDA 11.4 - 12.0)[^2] | :white_check_mark:
(CUDA 12.0 - 12.2) | :x: | :white_check_mark:
(CUDA 11.0-11.2; 11.6 - 12.0)[^2] | :white_check_mark:
(CUDA 11.2, 11.6 - 12.0)[^2] | :white_check_mark:
(CUDA 11.6 - 12.0)[^2] | :white_check_mark:
(CUDA 11.7 - 12.0) | :white_check_mark:
(CUDA 11.8 - 12.0) | :white_check_mark:
(CUDA 12.2) | :x: | :x: | :x: | :x: | :x: | +| CUDA (clang) | - | - | - | :x: | :x: | :x: | :x: | :x: | :white_check_mark: (CUDA 11.0 - 11.5) | :white_check_mark: (CUDA 11.0 - 11.5)[^1] | :white_check_mark: (CUDA 11.0 - 11.5)[^1] | :white_check_mark: (CUDA 11.0 - 11.8)[^1] | :x: | - | - | +| [HIP](https://alpaka.readthedocs.io/en/latest/install/HIP.html) (clang) | - | - | - | :x: | :x: | :x: | :x: | :x: | :white_check_mark: (HIP 5.0 - 5.2) | :white_check_mark: (HIP 5.3 - 5.4) | :white_check_mark: (HIP 5.5) | :x: | :x: | - | - | +| SYCL | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :x: | :white_check_mark:[^4] | :x: | :x: | Other compilers or combinations marked with :x: in the table above may work but are not tested in CI and are therefore not explicitly supported. @@ -223,31 +223,37 @@ Authors - Benjamin Worpitz* (original author) - Dr. Sergei Bastrakov* -- Dr. Andrea Bocci +- Kseniia Bastrakova +- Dr. Andrea Bocci* - Dr. Antonio Di Pilato - Simeon Ehrig +- Luca Ferragina - Bernhard Manfred Gruber* -- Dr. Axel Huebl +- Christian Kaever - Dr. Jeffrey Kelling -- Dr. Felice Pantaleo +- Dr. Stewart Martin-Haugh +- Aurora Perego - Jan Stephan* -- Dr. Jiří Vyskočil - René Widera* +- Dr. Jeffrey Young ### Former Members, Contributions and Thanks - Dr. Michael Bussmann - Mat Colgrove - Valentin Gehrke +- Dr. Axel Hübl - Maximilian Knespel - Jakob Krude - Alexander Matthes - Hauke Mewes - Phil Nash +- Dr. Felice Pantaleo - Dr. David M. Rogers - Mutsuo Saito - Jonas Schenke - Daniel Vollmer +- Dr. 
Jiří Vyskočil - Matthias Werner - Bert Wesarg - Malte Zacharias diff --git a/cmake/alpakaCommon.cmake b/cmake/alpakaCommon.cmake index fdbc38dc0067..cad0809b3826 100644 --- a/cmake/alpakaCommon.cmake +++ b/cmake/alpakaCommon.cmake @@ -614,7 +614,6 @@ if(alpaka_ACC_SYCL_ENABLE) if(alpaka_SYCL_ONEAPI_FPGA) target_compile_definitions(alpaka INTERFACE "ALPAKA_SYCL_ONEAPI_FPGA") - alpaka_set_compiler_options(DEVICE target alpaka "-fintelfpga") if(alpaka_SYCL_ONEAPI_FPGA_MODE STREQUAL "emulation") target_compile_definitions(alpaka INTERFACE "ALPAKA_FPGA_EMULATION") diff --git a/docs/source/conf.py b/docs/source/conf.py index cc5540c34306..56623cbd60fa 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,7 +13,7 @@ # The short X.Y version. version = u'1.0.0' # The full version, including alpha/beta/rc tags. -release = u'1.0.0-develop' +release = u'1.0.0-rc1' # The master toctree document. master_doc = 'index' diff --git a/docs/source/dev/style.rst b/docs/source/dev/style.rst index 3f1ed6712130..c95bfc5b51db 100644 --- a/docs/source/dev/style.rst +++ b/docs/source/dev/style.rst @@ -14,13 +14,13 @@ whitespace and braces automatically. Usage: .. code-block:: bash - clang-format-14 -i + clang-format-16 -i * If you want to format the entire code base execute the following command from alpaka's top-level directory: .. code-block:: bash - find example include test -name '*.hpp' -o -name '*.cpp' | xargs clang-format-14 -i + find example include test -name '*.hpp' -o -name '*.cpp' | xargs clang-format-16 -i Windows users should use `Visual Studio's native clang-format integration `. diff --git a/include/alpaka/atomic/AtomicOmpBuiltIn.hpp b/include/alpaka/atomic/AtomicOmpBuiltIn.hpp index 6d4dc96e72ec..440b373fc460 100644 --- a/include/alpaka/atomic/AtomicOmpBuiltIn.hpp +++ b/include/alpaka/atomic/AtomicOmpBuiltIn.hpp @@ -187,7 +187,8 @@ namespace alpaka # pragma omp atomic capture compare { old = ref; - ref = (ref <= value) ? 
ref : value; + if(value < ref) + ref = value; } return old; } @@ -205,7 +206,8 @@ namespace alpaka # pragma omp atomic capture compare { old = ref; - ref = (ref >= value) ? ref : value; + if(value > ref) + ref = value; } return old; } @@ -217,21 +219,12 @@ namespace alpaka { ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T { + // TODO(bgruber): atomic increment with wrap around is not implementable in OpenMP 5.1 T old; - auto& ref(*addr); -// atomically update ref, but capture the original value in old -# if BOOST_COMP_GNUC -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wconversion" -# endif -# pragma omp atomic capture compare +# pragma omp critical(AlpakaOmpAtomicOp) { - old = ref; - ref = ((ref >= value) ? 0 : (ref + 1)); + old = AtomicInc{}(addr, value); } -# if BOOST_COMP_GNUC -# pragma GCC diagnostic pop -# endif return old; } }; @@ -242,21 +235,12 @@ namespace alpaka { ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T { + // TODO(bgruber): atomic decrement with wrap around is not implementable in OpenMP 5.1 T old; - auto& ref(*addr); -// atomically update ref, but capture the original value in old -# if BOOST_COMP_GNUC -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wconversion" -# endif -# pragma omp atomic capture compare +# pragma omp critical(AlpakaOmpAtomicOp) { - old = ref; - ref = ((ref == 0) || (ref > value)) ? value : (ref - 1); + old = AtomicDec{}(addr, value); } -# if BOOST_COMP_GNUC -# pragma GCC diagnostic pop -# endif return old; } }; @@ -293,8 +277,8 @@ namespace alpaka ALPAKA_FN_HOST static auto atomicOp(AtomicOmpBuiltIn const&, T* const addr, T const& value) -> T { T old; -// \TODO: Currently not only the access to the same memory location is protected by a mutex but all atomic ops on all -// threads. 
+ // \TODO: Currently not only the access to the same memory location is protected by a mutex but all + // atomic ops on all threads. # pragma omp critical(AlpakaOmpAtomicOp) { old = TOp()(addr, value); @@ -309,8 +293,8 @@ namespace alpaka T const& value) -> T { T old; -// \TODO: Currently not only the access to the same memory location is protected by a mutex but all atomic ops on all -// threads. + // \TODO: Currently not only the access to the same memory location is protected by a mutex but all + // atomic ops on all threads. # pragma omp critical(AlpakaOmpAtomicOp2) { old = TOp()(addr, compare, value); diff --git a/include/alpaka/idx/MapIdx.hpp b/include/alpaka/idx/MapIdx.hpp index af0ccea41701..f081252878c0 100644 --- a/include/alpaka/idx/MapIdx.hpp +++ b/include/alpaka/idx/MapIdx.hpp @@ -5,6 +5,7 @@ #pragma once #include "alpaka/core/Common.hpp" +#include "alpaka/core/Unreachable.hpp" #include "alpaka/vec/Traits.hpp" #include "alpaka/vec/Vec.hpp" @@ -50,6 +51,8 @@ namespace alpaka } else static_assert(!sizeof(TElem), "Not implemented"); + + ALPAKA_UNREACHABLE({}); } //! 
Maps an N dimensional index to a N dimensional position based on the pitches of a view without padding or a @@ -89,5 +92,7 @@ namespace alpaka } else static_assert(!sizeof(TElem), "Not implemented"); + + ALPAKA_UNREACHABLE({}); } } // namespace alpaka diff --git a/include/alpaka/intrinsic/IntrinsicCpu.hpp b/include/alpaka/intrinsic/IntrinsicCpu.hpp index 98bc9df0af77..5db927bd44a8 100644 --- a/include/alpaka/intrinsic/IntrinsicCpu.hpp +++ b/include/alpaka/intrinsic/IntrinsicCpu.hpp @@ -5,6 +5,7 @@ #pragma once #include "alpaka/core/BoostPredef.hpp" +#include "alpaka/core/Unreachable.hpp" #include "alpaka/intrinsic/IntrinsicFallback.hpp" #include "alpaka/intrinsic/Traits.hpp" @@ -52,6 +53,7 @@ namespace alpaka // Fallback to standard library return static_cast(std::bitset(value).count()); #endif + ALPAKA_UNREACHABLE(0); } }; @@ -79,6 +81,7 @@ namespace alpaka #else return alpaka::detail::ffsFallback(value); #endif + ALPAKA_UNREACHABLE(0); } }; } // namespace trait diff --git a/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp b/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp index 7863d0173eda..e2d7ed3d1915 100644 --- a/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp +++ b/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp @@ -50,6 +50,10 @@ namespace alpaka { namespace detail { +# if BOOST_COMP_CLANG +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wunused-template" +# endif //! The GPU CUDA/HIP kernel entry point. // \NOTE: 'A __global__ function or function template cannot have a trailing return type.' 
// We have put the function into a shallow namespace and gave it a short name, so the mangled name in the @@ -74,6 +78,9 @@ namespace alpaka # endif kernelFnObj(const_cast(acc), args...); } +# if BOOST_COMP_CLANG +# pragma clang diagnostic pop +# endif } // namespace detail namespace uniform_cuda_hip diff --git a/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp b/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp index 04dcbb10afd3..79340323ff01 100644 --- a/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp +++ b/include/alpaka/mem/buf/BufUniformCudaHipRt.hpp @@ -310,8 +310,8 @@ namespace alpaka return { dev, reinterpret_cast(memPtr), - [queue = std::move(queue)](TElem* ptr) - { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::freeAsync(ptr, queue.getNativeHandle())); }, + [q = std::move(queue)](TElem* ptr) + { ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK_NOEXCEPT(TApi::freeAsync(ptr, q.getNativeHandle())); }, extent, static_cast(width) * sizeof(TElem)}; } diff --git a/include/alpaka/mem/buf/Traits.hpp b/include/alpaka/mem/buf/Traits.hpp index 5494b7fa5619..33e7c9bda7f1 100644 --- a/include/alpaka/mem/buf/Traits.hpp +++ b/include/alpaka/mem/buf/Traits.hpp @@ -164,14 +164,14 @@ namespace alpaka //! this function is provided for convenience in the cases where the difference is not relevant, //! and the pinned/mapped memory is only used as a performance optimisation. //! - //! \tparam TPlatform The platform from which the buffer is accessible. //! \tparam TElem The element type of the returned buffer. //! \tparam TIdx The linear index type of the buffer. //! \tparam TExtent The extent type of the buffer. + //! \tparam TPlatform The platform from which the buffer is accessible. //! \param host The host device to allocate the buffer on. //! \param extent The extent of the buffer. //! \return The newly allocated buffer. 
- template + template ALPAKA_FN_HOST auto allocMappedBufIfSupported( DevCpu const& host, TPlatform const& platform, diff --git a/include/alpaka/platform/PlatformCpu.hpp b/include/alpaka/platform/PlatformCpu.hpp index a1a2cb1137d1..c431fd418785 100644 --- a/include/alpaka/platform/PlatformCpu.hpp +++ b/include/alpaka/platform/PlatformCpu.hpp @@ -16,6 +16,12 @@ namespace alpaka //! The CPU device platform. struct PlatformCpu : concepts::Implements { +#if defined(BOOST_COMP_GNUC) && BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(11, 0, 0) \ + && BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(12, 0, 0) + // This is a workaround for g++-11 bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96295 + // g++-11 complains in *all* places where a PlatformCpu is used, that it "may be used uninitialized" + char c = {}; +#endif }; namespace trait diff --git a/include/alpaka/platform/PlatformUniformCudaHipRt.hpp b/include/alpaka/platform/PlatformUniformCudaHipRt.hpp index ffda2de2faab..9784f54520b2 100644 --- a/include/alpaka/platform/PlatformUniformCudaHipRt.hpp +++ b/include/alpaka/platform/PlatformUniformCudaHipRt.hpp @@ -28,6 +28,12 @@ namespace alpaka template struct PlatformUniformCudaHipRt : concepts::Implements> { +# if defined(BOOST_COMP_GNUC) && BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(11, 0, 0) \ + && BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(12, 0, 0) + // This is a workaround for g++-11 bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96295 + // g++-11 complains in *all* places where a PlatformUniformCudaHipRt is used, that it "may be used uninitialized" + char c = {}; +# endif }; namespace trait diff --git a/include/alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp b/include/alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp index 50d2285c1fcd..3a85fac26e41 100644 --- a/include/alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp +++ b/include/alpaka/queue/cuda_hip/QueueUniformCudaHipRt.hpp @@ -212,7 +212,7 @@ namespace alpaka { auto data = std::unique_ptr(reinterpret_cast(arg)); auto& queue = data->q; -
auto f = queue.m_callbackThread.submit([data = std::move(data)] { data->t(); }); + auto f = queue.m_callbackThread.submit([d = std::move(data)] { d->t(); }); f.wait(); } diff --git a/include/alpaka/rand/Philox/MultiplyAndSplit64to32.hpp b/include/alpaka/rand/Philox/MultiplyAndSplit64to32.hpp index c9518b1b9fef..e0c036128770 100644 --- a/include/alpaka/rand/Philox/MultiplyAndSplit64to32.hpp +++ b/include/alpaka/rand/Philox/MultiplyAndSplit64to32.hpp @@ -11,13 +11,13 @@ namespace alpaka::rand { /// Get high 32 bits of a 64-bit number - ALPAKA_FN_HOST_ACC static constexpr auto high32Bits(std::uint64_t const x) -> std::uint32_t + ALPAKA_FN_HOST_ACC inline constexpr auto high32Bits(std::uint64_t const x) -> std::uint32_t { return static_cast(x >> 32); } /// Get low 32 bits of a 64-bit number - ALPAKA_FN_HOST_ACC static constexpr auto low32Bits(std::uint64_t const x) -> std::uint32_t + ALPAKA_FN_HOST_ACC inline constexpr auto low32Bits(std::uint64_t const x) -> std::uint32_t { return static_cast(x & 0xffff'ffff); } @@ -30,7 +30,7 @@ namespace alpaka::rand * @param resultLow low 32 bits of the product a*b */ // TODO: See single-instruction implementations in original Philox source code - ALPAKA_FN_HOST_ACC static constexpr void multiplyAndSplit64to32( + ALPAKA_FN_HOST_ACC inline constexpr void multiplyAndSplit64to32( std::uint64_t const a, std::uint64_t const b, std::uint32_t& resultHigh, diff --git a/include/alpaka/rand/TinyMT/tinymt32.h b/include/alpaka/rand/TinyMT/tinymt32.h index ffa7c1ca9a2b..55a946f2d435 100644 --- a/include/alpaka/rand/TinyMT/tinymt32.h +++ b/include/alpaka/rand/TinyMT/tinymt32.h @@ -37,6 +37,7 @@ #if BOOST_COMP_CLANG # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wold-style-cast" +# pragma clang diagnostic ignored "-Wunused-function" #endif #if BOOST_COMP_GNUC # pragma GCC diagnostic push diff --git a/include/alpaka/test/KernelExecutionFixture.hpp b/include/alpaka/test/KernelExecutionFixture.hpp index 8d783be62186..6d2cf31f8176 
100644 --- a/include/alpaka/test/KernelExecutionFixture.hpp +++ b/include/alpaka/test/KernelExecutionFixture.hpp @@ -25,13 +25,6 @@ namespace alpaka::test template class KernelExecutionFixture { -#if defined(BOOST_COMP_GNUC) && BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(11, 0, 0) \ - && BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(12, 0, 0) -// g++-11 (wrongly) believes that m_platformHost is used in an uninitialized state. -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif - public: using Acc = TAcc; using Dim = alpaka::Dim; @@ -82,9 +75,5 @@ namespace alpaka::test Device m_device{getDevByIdx(m_platform, 0)}; Queue m_queue{m_device}; WorkDiv m_workDiv; -#if defined(BOOST_COMP_GNUC) && BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(11, 0, 0) \ - && BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(12, 0, 0) -# pragma GCC diagnostic pop -#endif }; } // namespace alpaka::test diff --git a/include/alpaka/test/queue/QueueTestFixture.hpp b/include/alpaka/test/queue/QueueTestFixture.hpp index d55bf70df7e5..ad6f8150afc6 100644 --- a/include/alpaka/test/queue/QueueTestFixture.hpp +++ b/include/alpaka/test/queue/QueueTestFixture.hpp @@ -12,12 +12,6 @@ namespace alpaka::test template struct QueueTestFixture { -#if defined(BOOST_COMP_GNUC) && BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(11, 0, 0) \ - && BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(12, 0, 0) -// g++-11 (wrongly) believes that m_platform is used in an uninitialized state. 
-# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif using Dev = std::tuple_element_t<0, TDevQueue>; using Queue = std::tuple_element_t<1, TDevQueue>; using Platform = alpaka::Platform; @@ -25,9 +19,5 @@ namespace alpaka::test Platform m_platform{}; Dev m_dev{getDevByIdx(m_platform, 0)}; Queue m_queue{m_dev}; -#if defined(BOOST_COMP_GNUC) && BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(11, 0, 0) \ - && BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(12, 0, 0) -# pragma GCC diagnostic pop -#endif }; } // namespace alpaka::test diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp index a9b3a56fe69f..e420b615585b 100644 --- a/include/alpaka/warp/WarpGenericSycl.hpp +++ b/include/alpaka/warp/WarpGenericSycl.hpp @@ -49,16 +49,22 @@ namespace alpaka::warp::trait // Restrict to warpSize <= 32 for now. static auto activemask(warp::WarpGenericSycl const& warp) -> std::uint32_t { - // SYCL has no way of querying this. Since sub-group functions have to be executed in convergent code - // regions anyway we return the full mask. - auto const sub_group = warp.m_item_warp.get_sub_group(); - auto const mask = sycl::ext::oneapi::group_ballot(sub_group, true); - // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU, - // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet. - // Restrict to warpSize <= 32 for now. - std::uint32_t bits = 0; - mask.extract_bits(bits); - return bits; + static_assert(!sizeof(warp), "activemask is not supported on SYCL"); + // SYCL does not have an API to get the activemask. It is also questionable (to me, bgruber) whether an + // "activemask" even exists on some hardware architectures, since the idea is bound to threads being + // "turned off" when they take different control flow in a warp. 
A SYCL implementation could run each + // thread as a SIMD lane, in which case the "thread" is always active, but some SIMD lanes are either + // predicated off, or side-effects are masked out when writing them back. + // + // An implementation via oneAPI's sycl::ext::oneapi::group_ballot causes UB, because activemask is expected + // to be callable when less than all threads are active in a warp (CUDA). But SYCL requires all threads of + // a group to call the function. + // + // Intel's CUDA -> SYCL migration tool also suggests that there is no direct equivalent and the user must + // rewrite their kernel logic. See also: + // https://oneapi-src.github.io/SYCLomatic/dev_guide/diagnostic_ref/dpct1086.html + + return ~std::uint32_t{0}; } }; diff --git a/script/job_generator/alpaka_filter.py b/script/job_generator/alpaka_filter.py index c7a74d935472..71a43c1f4b70 100644 --- a/script/job_generator/alpaka_filter.py +++ b/script/job_generator/alpaka_filter.py @@ -24,7 +24,7 @@ def alpaka_post_filter(row: List) -> bool: and row[param_map[BUILD_TYPE]][VERSION] == CMAKE_DEBUG and row_check_name(row, DEVICE_COMPILER, "==", CLANG_CUDA) ): - for clang_cuda_version in ["15", "16"]: + for clang_cuda_version in ["15", "16", "17"]: if row_check_version(row, HOST_COMPILER, "==", clang_cuda_version): return False diff --git a/script/job_generator/versions.py b/script/job_generator/versions.py index 2f2251306d25..d45f74d4b42d 100644 --- a/script/job_generator/versions.py +++ b/script/job_generator/versions.py @@ -12,7 +12,7 @@ sw_versions: Dict[str, List[str]] = { GCC: ["9", "10", "11", "12", "13"], - CLANG: ["9", "10", "11", "12", "13", "14", "15", "16"], + CLANG: ["9", "10", "11", "12", "13", "14", "15", "16", "17"], NVCC: [ "11.0", "11.1", diff --git a/script/run_generate.sh b/script/run_generate.sh index 96a0d516f374..435aee3a8b9d 100755 --- a/script/run_generate.sh +++ b/script/run_generate.sh @@ -78,7 +78,7 @@ mkdir -p build/ cd build/ "${ALPAKA_CI_CMAKE_EXECUTABLE}"
--log-level=VERBOSE -G "${ALPAKA_CI_CMAKE_GENERATOR}" ${ALPAKA_CI_CMAKE_GENERATOR_PLATFORM}\ - -Dalpaka_BUILD_EXAMPLES=ON -DBUILD_TESTING=ON \ + -Dalpaka_BUILD_EXAMPLES=ON -DBUILD_TESTING=ON "$(env2cmake alpaka_ENABLE_WERROR)" \ "$(env2cmake BOOST_ROOT)" -DBOOST_LIBRARYDIR="${ALPAKA_CI_BOOST_LIB_DIR}/lib" -DBoost_USE_STATIC_LIBS=ON -DBoost_USE_MULTITHREADED=ON -DBoost_USE_STATIC_RUNTIME=OFF -DBoost_ARCHITECTURE="-x64" \ "$(env2cmake CMAKE_BUILD_TYPE)" "$(env2cmake CMAKE_CXX_FLAGS)" "$(env2cmake CMAKE_C_COMPILER)" "$(env2cmake CMAKE_CXX_COMPILER)" "$(env2cmake CMAKE_EXE_LINKER_FLAGS)" "$(env2cmake CMAKE_CXX_EXTENSIONS)"\ "$(env2cmake alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE)" "$(env2cmake alpaka_ACC_CPU_B_SEQ_T_THREADS_ENABLE)" \ diff --git a/test/common/devCompileOptions.cmake b/test/common/devCompileOptions.cmake index 6557c1a078af..84e892bf17d7 100644 --- a/test/common/devCompileOptions.cmake +++ b/test/common/devCompileOptions.cmake @@ -10,7 +10,6 @@ if(alpaka_ACC_GPU_CUDA_ENABLE AND (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")) if(alpaka_ENABLE_WERROR) list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:--Wreorder>") list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:--Wdefault-stream-launch>") - list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:--Wmissing-launch-bounds>") list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:--Wext-lambda-captures-this>") list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:--Werror all-warnings>") else() @@ -124,6 +123,7 @@ if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") list(APPEND alpaka_DEV_COMPILE_OPTIONS "-Walloc-zero") list(APPEND alpaka_DEV_COMPILE_OPTIONS "-Walloca") list(APPEND alpaka_DEV_COMPILE_OPTIONS "-Wcast-align=strict") + list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -Wno-unused-value>") # occurs in nvcc-generated code endif() # Clang, AppleClang, ICPX @@ -146,9 +146,26 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Inte list(APPEND alpaka_DEV_COMPILE_OPTIONS "-Wno-extra-semi-stmt") # 
Silence warnings caused by nvcc-generated code and -Weverything - list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -Wno-reserved-id-macro>") + list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -Wno-missing-noreturn>") list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -Wno-missing-variable-declarations>") list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -Wno-old-style-cast>") + list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -Wno-overlength-strings>") + list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -Wno-reserved-id-macro>") + list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -Wno-unused-macros>") + list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -Wno-used-but-marked-unused>") + list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -Wno-zero-as-null-pointer-constant>") + list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -Wno-unreachable-code>") + list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -Wno-extra-semi>") + list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -Wno-deprecated>") + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) + list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -Wno-reserved-identifier>") + endif() + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 15.0) + list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -Wno-gnu-line-marker>") + endif() + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10.0) + list(APPEND alpaka_DEV_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -Wno-deprecated-copy>") + endif() if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0) list(APPEND alpaka_DEV_COMPILE_OPTIONS "-Wno-poison-system-directories") diff --git a/test/integ/axpy/src/axpy.cpp b/test/integ/axpy/src/axpy.cpp index 
29a82eb96b55..c5742814a610 100644 --- a/test/integ/axpy/src/axpy.cpp +++ b/test/integ/axpy/src/axpy.cpp @@ -76,7 +76,6 @@ TEMPLATE_LIST_TEST_CASE("axpy", "[axpy]", TestAccs) using Val = float; using DevAcc = alpaka::Dev; - using PlatformAcc = alpaka::Platform; using QueueAcc = alpaka::test::DefaultQueue; // Create the kernel function object. @@ -109,9 +108,9 @@ TEMPLATE_LIST_TEST_CASE("axpy", "[axpy]", TestAccs) << std::endl; // Allocate host memory buffers in pinned memory. - auto memBufHostX = alpaka::allocMappedBufIfSupported(devHost, platformAcc, extent); - auto memBufHostOrigY = alpaka::allocMappedBufIfSupported(devHost, platformAcc, extent); - auto memBufHostY = alpaka::allocMappedBufIfSupported(devHost, platformAcc, extent); + auto memBufHostX = alpaka::allocMappedBufIfSupported(devHost, platformAcc, extent); + auto memBufHostOrigY = alpaka::allocMappedBufIfSupported(devHost, platformAcc, extent); + auto memBufHostY = alpaka::allocMappedBufIfSupported(devHost, platformAcc, extent); Val* const pBufHostX = alpaka::getPtrNative(memBufHostX); Val* const pBufHostOrigY = alpaka::getPtrNative(memBufHostOrigY); Val* const pBufHostY = alpaka::getPtrNative(memBufHostY); diff --git a/test/integ/hostOnlyAPI/src/hostOnlyAPI.cpp b/test/integ/hostOnlyAPI/src/hostOnlyAPI.cpp index 5214dfd1b64b..59eb0bdd7600 100644 --- a/test/integ/hostOnlyAPI/src/hostOnlyAPI.cpp +++ b/test/integ/hostOnlyAPI/src/hostOnlyAPI.cpp @@ -47,8 +47,7 @@ TEMPLATE_LIST_TEST_CASE("hostOnlyAPI", "[hostOnlyAPI]", TestAccs) HostQueue hostQueue(host); // host buffer - auto h_buffer1 - = alpaka::allocMappedBufIfSupported, int, Idx>(host, platformAcc, Vec1D{Idx{42}}); + auto h_buffer1 = alpaka::allocMappedBufIfSupported(host, platformAcc, Vec1D{Idx{42}}); INFO( "host buffer allocated at " << alpaka::getPtrNative(h_buffer1) << " with " << alpaka::getExtentProduct(h_buffer1) << " element(s)"); diff --git a/test/integ/mandelbrot/src/mandelbrot.cpp b/test/integ/mandelbrot/src/mandelbrot.cpp index 
b56b11ca5802..ce94e178de16 100644 --- a/test/integ/mandelbrot/src/mandelbrot.cpp +++ b/test/integ/mandelbrot/src/mandelbrot.cpp @@ -309,7 +309,7 @@ TEMPLATE_LIST_TEST_CASE("mandelbrot", "[mandelbrot]", TestAccs) << std::endl; // allocate host memory, potentially pinned for faster copy to/from the accelerator. - auto bufColorHost = alpaka::allocMappedBufIfSupported(devHost, platformAcc, extent); + auto bufColorHost = alpaka::allocMappedBufIfSupported(devHost, platformAcc, extent); // Allocate the buffer on the accelerator. auto bufColorAcc = alpaka::allocBuf(devAcc, extent); diff --git a/test/integ/matMul/src/matMul.cpp b/test/integ/matMul/src/matMul.cpp index bf45822e9b69..149d94df73c9 100644 --- a/test/integ/matMul/src/matMul.cpp +++ b/test/integ/matMul/src/matMul.cpp @@ -162,8 +162,6 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs) using Val = std::uint32_t; using Vec2 = alpaka::Vec; - using DevAcc = alpaka::Dev; - using PlatformAcc = alpaka::Platform; using QueueAcc = alpaka::test::DefaultQueue>; using QueueHost = alpaka::QueueCpuNonBlocking; @@ -217,7 +215,7 @@ TEMPLATE_LIST_TEST_CASE("matMul", "[matMul]", TestAccs) auto bufBHost = alpaka::createView(devHost, bufBHost1d.data(), extentB); // Allocate C and set it to zero. - auto bufCHost = alpaka::allocMappedBufIfSupported(devHost, platformAcc, extentC); + auto bufCHost = alpaka::allocMappedBufIfSupported(devHost, platformAcc, extentC); alpaka::memset(queueHost, bufCHost, 0u); // Allocate the buffers on the accelerator. 
diff --git a/test/integ/separableCompilation/src/main.cpp b/test/integ/separableCompilation/src/main.cpp index ff7b79853780..cec49acf5049 100644 --- a/test/integ/separableCompilation/src/main.cpp +++ b/test/integ/separableCompilation/src/main.cpp @@ -103,9 +103,9 @@ TEMPLATE_LIST_TEST_CASE("separableCompilation", "[separableCompilation]", TestAc << ", numElements:" << numElements << ")" << std::endl; // Allocate host memory buffers, potentially pinned for faster copy to/from the accelerator. - auto memBufHostA = alpaka::allocMappedBufIfSupported(devHost, platformAcc, extent); - auto memBufHostB = alpaka::allocMappedBufIfSupported(devHost, platformAcc, extent); - auto memBufHostC = alpaka::allocMappedBufIfSupported(devHost, platformAcc, extent); + auto memBufHostA = alpaka::allocMappedBufIfSupported(devHost, platformAcc, extent); + auto memBufHostB = alpaka::allocMappedBufIfSupported(devHost, platformAcc, extent); + auto memBufHostC = alpaka::allocMappedBufIfSupported(devHost, platformAcc, extent); // Initialize the host input vectors for(Idx i = 0; i < numElements; ++i) diff --git a/test/unit/math/src/Buffer.hpp b/test/unit/math/src/Buffer.hpp index 2bfc7f156ba5..ac0f002c29c0 100644 --- a/test/unit/math/src/Buffer.hpp +++ b/test/unit/math/src/Buffer.hpp @@ -62,10 +62,7 @@ namespace alpaka // Constructor needs to initialize all Buffer. 
Buffer(DevAcc const& devAcc) : devHost{alpaka::getDevByIdx(platformHost, 0)} - , hostBuffer{alpaka::allocMappedBufIfSupported( - devHost, - platformAcc, - Tcapacity)} + , hostBuffer{alpaka::allocMappedBufIfSupported(devHost, platformAcc, Tcapacity)} , devBuffer{alpaka::allocBuf(devAcc, Tcapacity)} , pHostBuffer{alpaka::getPtrNative(hostBuffer)} , pDevBuffer{alpaka::getPtrNative(devBuffer)} diff --git a/test/unit/mem/copy/src/BufSlicing.cpp b/test/unit/mem/copy/src/BufSlicing.cpp index a978562e453e..6169fdaf5ff2 100644 --- a/test/unit/mem/copy/src/BufSlicing.cpp +++ b/test/unit/mem/copy/src/BufSlicing.cpp @@ -20,12 +20,6 @@ template> struct TestContainer { -#if defined(BOOST_COMP_GNUC) && BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(11, 0, 0) \ - && BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(12, 0, 0) -// g++-11 (wrongly) believes that platformHost is used in an uninitialized state. -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif using AccQueueProperty = alpaka::Blocking; using DevQueue = alpaka::Queue; using DevAcc = alpaka::Dev; @@ -106,10 +100,6 @@ struct TestContainer REQUIRE(ptrA[i] == Catch::Approx(ptrB[i])); } } -#if defined(BOOST_COMP_GNUC) && BOOST_COMP_GNUC >= BOOST_VERSION_NUMBER(11, 0, 0) \ - && BOOST_COMP_GNUC < BOOST_VERSION_NUMBER(12, 0, 0) -# pragma GCC diagnostic pop -#endif }; using DataTypes = std::tuple; diff --git a/test/unit/mem/fence/src/FenceTest.cpp b/test/unit/mem/fence/src/FenceTest.cpp index 260d6e845ab8..4e1c0df05259 100644 --- a/test/unit/mem/fence/src/FenceTest.cpp +++ b/test/unit/mem/fence/src/FenceTest.cpp @@ -188,7 +188,7 @@ TEMPLATE_LIST_TEST_CASE("FenceTest", "[fence]", TestAccs) auto const numElements = Idx{2ul}; auto const extent = alpaka::Vec{numElements}; - auto vars_host = alpaka::allocMappedBufIfSupported(host, platformAcc, extent); + auto vars_host = alpaka::allocMappedBufIfSupported(host, platformAcc, extent); auto vars_dev = alpaka::allocBuf(dev, extent); vars_host[0] = 1; 
vars_host[1] = 2; diff --git a/test/unit/warp/src/Activemask.cpp b/test/unit/warp/src/Activemask.cpp index d433698a836c..223f3535f670 100644 --- a/test/unit/warp/src/Activemask.cpp +++ b/test/unit/warp/src/Activemask.cpp @@ -2,6 +2,8 @@ * SPDX-License-Identifier: MPL-2.0 */ +#include +#include #include #include #include @@ -63,63 +65,77 @@ struct alpaka::trait::WarpSize TEMPLATE_LIST_TEST_CASE("activemask", "[warp]", alpaka::test::TestAccs) { using Acc = TestType; - using Dim = alpaka::Dim; - using Idx = alpaka::Idx; - - auto const platform = alpaka::Platform{}; - auto const dev = alpaka::getDevByIdx(platform, 0); - auto const warpExtents = alpaka::getWarpSizes(dev); - for(auto const warpExtent : warpExtents) + if constexpr(alpaka::accMatchesTags< + Acc, + alpaka::TagCpuSycl, + alpaka::TagGpuSyclIntel, + alpaka::TagFpgaSyclIntel, + alpaka::TagGenericSycl>) { - auto const scalar = Dim::value == 0 || warpExtent == 1; - if(scalar) - { - alpaka::test::KernelExecutionFixture fixture(alpaka::Vec::all(4)); - CHECK(fixture(ActivemaskSingleThreadWarpTestKernel{})); - } - else + std::cout << "Test disabled for SYCL\n"; + return; + } + else + { + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + + auto const platform = alpaka::Platform{}; + auto const dev = alpaka::getDevByIdx(platform, 0); + auto const warpExtents = alpaka::getWarpSizes(dev); + for(auto const warpExtent : warpExtents) { - using ExecutionFixture = alpaka::test::KernelExecutionFixture; - auto const gridBlockExtent = alpaka::Vec::all(2); - // Enforce one warp per thread block - auto blockThreadExtent = alpaka::Vec::ones(); - blockThreadExtent[0] = static_cast(warpExtent); - auto const threadElementExtent = alpaka::Vec::ones(); - auto workDiv = typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; - auto fixture = ExecutionFixture{workDiv}; - if(warpExtent == 4) + auto const scalar = Dim::value == 0 || warpExtent == 1; + if(scalar) { - for(auto inactiveThreadIdx = 0u; 
inactiveThreadIdx < warpExtent; inactiveThreadIdx++) - { - CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<4>{}, inactiveThreadIdx)); - } + alpaka::test::KernelExecutionFixture fixture(alpaka::Vec::all(4)); + CHECK(fixture(ActivemaskSingleThreadWarpTestKernel{})); } - else if(warpExtent == 8) + else { - for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + using ExecutionFixture = alpaka::test::KernelExecutionFixture; + auto const gridBlockExtent = alpaka::Vec::all(2); + // Enforce one warp per thread block + auto blockThreadExtent = alpaka::Vec::ones(); + blockThreadExtent[0] = static_cast(warpExtent); + auto const threadElementExtent = alpaka::Vec::ones(); + auto workDiv = + typename ExecutionFixture::WorkDiv{gridBlockExtent, blockThreadExtent, threadElementExtent}; + auto fixture = ExecutionFixture{workDiv}; + if(warpExtent == 4) { - CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<8>{}, inactiveThreadIdx)); + for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + { + CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<4>{}, inactiveThreadIdx)); + } } - } - else if(warpExtent == 16) - { - for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + else if(warpExtent == 8) { - CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<16>{}, inactiveThreadIdx)); + for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + { + CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<8>{}, inactiveThreadIdx)); + } } - } - else if(warpExtent == 32) - { - for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + else if(warpExtent == 16) { - CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<32>{}, inactiveThreadIdx)); + for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + { + CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<16>{}, inactiveThreadIdx)); + } } - } - else 
if(warpExtent == 64) - { - for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + else if(warpExtent == 32) + { + for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + { + CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<32>{}, inactiveThreadIdx)); + } + } + else if(warpExtent == 64) { - CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<64>{}, inactiveThreadIdx)); + for(auto inactiveThreadIdx = 0u; inactiveThreadIdx < warpExtent; inactiveThreadIdx++) + { + CHECK(fixture(ActivemaskMultipleThreadWarpTestKernel<64>{}, inactiveThreadIdx)); + } } } }