From 293fd9646ff5c88b2bacdad77fb8ccd70f811e84 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Wed, 4 Sep 2024 12:10:48 -0600 Subject: [PATCH 1/2] Add KOKKOS_FUNCTION etc. documentation --- docs/source/API/core/Macros.rst | 8 + .../macros-special/host_device_macros.rst | 141 ++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 docs/source/API/core/macros-special/host_device_macros.rst diff --git a/docs/source/API/core/Macros.rst b/docs/source/API/core/Macros.rst index 31442c748..1e51167b0 100644 --- a/docs/source/API/core/Macros.rst +++ b/docs/source/API/core/Macros.rst @@ -1,6 +1,14 @@ Macros ====== +Function Markup Macros +---------------------- + +.. toctree:: + :maxdepth: 1 + + macros-special/host_device_macros.rst + Version Macros -------------- diff --git a/docs/source/API/core/macros-special/host_device_macros.rst b/docs/source/API/core/macros-special/host_device_macros.rst new file mode 100644 index 000000000..fc6aa5e65 --- /dev/null +++ b/docs/source/API/core/macros-special/host_device_macros.rst @@ -0,0 +1,141 @@ + +``Function markup macros`` +========================== + +.. role::cpp(code) + :language: cpp + +Defined in header ```` + +Usage: + +.. code-block:: cpp + + KOKKOS_FUNCTION void foo(); + KOKKOS_INLINE_FUNCTION void foo(); + KOKKOS_FORCEINLINE_FUNCTION void foo(); + auto l = KOKKOS_LAMBDA(int i) { ... }; + auto l = KOKKOS_CLASS_LAMBDA(int i) { ... }; + +These macros deal with the management of split compilation for device and host code. +They fullfill the same purpose as the ``__host__ __device__`` markup in CUDA and HIP. +Generally only functions marked with one of these macros can be used inside of parallel +Kokkos code - i.e. all code executed in parallel algorithms must be marked up by one +of these macros. + +``KOKKOS_FUNCTION`` +------------------- + +This macro is the equivalent of ``__host__ __device__`` markup in CUDA and HIP. +Use it primarily on inline-defined member functions of classes and templated +free functions + +.. code-block:: cpp + + class Foo { + public: + // inline defined constructor + KOKKOS_FUNCTION Foo() { ... }; + + // inline defined member function + template + KOKKOS_FUNCTION void bar() const { ... } + }; + + template + KOKKOS_FUNCTION void foo(T v) { ... } + +This macro is also used for non-templated free functions in conjunction with relocatable device code - +i.e. if one wants to compile functions in some compilation unit A but call them from Kokkos +parallel constructs defined in compilation unit B. + + +``KOKKOS_INLINE_FUNCTION`` +-------------------------- + +This macro is the equivalent of ``__host__ __device__ inline`` markup in CUDA and HIP. +Use it primarily for non-templated free functions: + +.. code-block:: cpp + + KOKKOS_INLINE_FUNCTION void foo() {} + +Note that it is NOT a bug to use this macro for inline-defined member function of classes, or +templated free functions. It is simply redundant since they are by default inline. + +``KOKKOS_FORCEINLINE_FUNCTION`` +------------------------------- + +This macro is the equivalent of ``__host__ __device__`` markup in CUDA and HIP, but also uses +compiler dependent hints (if available) to enforce inlining. +This can help with some functions which are often used, but it may also hurt compilation time, +as well as runtime performance due to code-bloat. In some instances using ``KOKKOS_FORCEINLINE_FUNCTION`` +excessively can even cause compilation errors due to compiler specific limits of maximum inline limits. +Use this macro only in conjunction with performing extensive performance checks. + +.. code-block:: cpp + + class Foo { + public: + KOKKOS_FORCEINLINE_FUNCTION + Foo() { ... }; + + template + KOKKOS_FORCEINLINE_FUNCTION + void bar() const { ... } + }; + + template + KOKKOS_FORCEINLINE_FUNCTION + void foo(T v) { ... } + +This macro is also used for non-templated free functions in conjunction with relocatable device code - +i.e. if one wants to compile functions in some compilation unit A but call them from Kokkos +parallel constructs defined in compilation unit B. + + +``KOKKOS_LAMBDA`` +----------------- + +This macro provides default capture clause and host device markup for lambdas. It is the equivalent of +``[=] __host__ __device__`` in CUDA and HIP. +It is used than creating C++ lambdas to be passed to Kokkos parallel dispatch mechanisms such as +``parallel_for``, ``parallel_reduce`` and ``parallel_scan``. + +.. code-block:: cpp + + void foo(...) { + ... + parallel_for("Name", N, KOKKOS_LAMBDA(int i) { + ... + }); + ... + parallel_reduce("Name", N, KOKKOS_LAMBDA(int i, double& v) { + ... + }, result); + ... + } + +.. warning:: Do not use ``KOKKOS_LAMBDA`` inside functions marked as ``KOKKOS_FUNCTION`` etc. or within a lambda marked with ``KOKKOS_LAMBDA``. Specifically do not use ``KOKKOS_LAMBDA`` to define lambdas for nested parallel calls. CUDA does not support that. Use plain C++ syntax instead: ``[=] (int i) {...}``. + +.. warning:: When creating lambdas inside of class member functions you may need to use ``KOKKOS_CLASS_LAMBDA`` instead. + +``KOKKOS_CLASS_LAMBDA`` +----------------------- + +This macro provides default capture clause and host device markup for lambdas created inside of class member functions. It is the equivalent of +``[=, *this] __host__ __device__`` in CUDA and HIP, capturing the parent class by value instead of by reference. + +.. code-block:: cpp + + class Foo { + public: + Foo() { ... }; + + void bar() const { + parallel_for("Name", N, KOKKOS_CLASS_LAMBDA(int i) { + ... + }); + } + }; + From ee52de29225d25624c16bb7c189ff30ad1492c09 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Wed, 4 Sep 2024 14:09:21 -0600 Subject: [PATCH 2/2] Address Damien's comments --- .../macros-special/host_device_macros.rst | 69 ++++++++++++++++++- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/docs/source/API/core/macros-special/host_device_macros.rst b/docs/source/API/core/macros-special/host_device_macros.rst index fc6aa5e65..4f5ceb706 100644 --- a/docs/source/API/core/macros-special/host_device_macros.rst +++ b/docs/source/API/core/macros-special/host_device_macros.rst @@ -1,6 +1,6 @@ -``Function markup macros`` -========================== +``Function Annotation Macros`` +============================== .. role::cpp(code) :language: cpp @@ -131,11 +131,76 @@ This macro provides default capture clause and host device markup for lambdas cr class Foo { public: Foo() { ... }; + int data; + KOKKOS_FUNCTION print_data() const { + printf("Data: %i\n",data); + } void bar() const { parallel_for("Name", N, KOKKOS_CLASS_LAMBDA(int i) { ... + print_data(); + printf("%i %i\n",i,data); + }); + } + }; + +Note: If one wants to avoid capturing a copy of the entire class in the lambda, one has to create local +copies of any accessed data members, and can not use non-static member functions inside the lambda: + +.. code-block:: cpp + + class Foo { + public: + Foo() { ... }; + int data; + + KOKKOS_FUNCTION print_data() const { + printf("Data: %i\n",data); + } + void bar() const { + int data_copy = data; + parallel_for("Name", N, KOKKOS_LAMBDA(int i) { + ... + // can't call member functions + // print_data(); + // use the copy of data + printf("%i %i\n",i,data_copy); }); } }; + +``KOKKOS_DEDUCTION_GUIDE`` +----------------------- + +This macro is used to annotate deduciont guides. + + +.. code-block:: cpp + + template + class Foo { + T data[N]; + public: + template + KOKKOS_FUNCTION + Foo(Args ... args):data{static_cast(args)...} {} + + KOKKOS_FUNCTION void print(int i) const { + printf("%i\n",static_cast(data[i])); + } + }; + + template + KOKKOS_DEDUCTION_GUIDE + Foo(T, Args...) -> Foo; + + void bar() { + Kokkos::parallel_for(1, KOKKOS_LAMBDA(int) { + Foo f(1, 2., 3.2f); + f.print(0); + f.print(1); + f.print(2); + }); + }