From 6a66fffcf62631ff602836e815f4778ccb298296 Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Fri, 5 May 2023 14:42:44 -0700 Subject: [PATCH 01/29] Update kp_sampler_skip.cpp --- common/kokkos-sampler/kp_sampler_skip.cpp | 45 +++++++++++++++++++++-- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 773753f8b..925dda19b 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -27,6 +27,18 @@ static endFunction endForCallee = NULL; static endFunction endScanCallee = NULL; static endFunction endReduceCallee = NULL; +void getGlobFenceChoice() +{ + // re-read environment variable to get most accurate value + const char* tool_globFence_str = getenv("KOKKOS_TOOLS_GLOBALFENCES"); + if (NULL != tool_globFence_str) { + tool_globFence = atoi(tool_globFence_str); + } + else { + tool_globFence = 0; + } +} + void kokkosp_request_tool_settings(const uint32_t, Kokkos_Tools_ToolSettings* settings) { if (0 == tool_globFence) { @@ -154,7 +166,12 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, uint64_t* kID) { *kID = uniqID++; - if (((*kID) % kernelSampleSkip) == 0) { + if (((*kID) % kernelSampleSkip) == 0) { + getGlobFenceChoice(); // re-read environment variable to get most accurate value + if(tool_globFence > 0) { + // invoke tool-induced fence from Kokkos_C_Profiling_interface + } + if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); @@ -167,7 +184,12 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, } void kokkosp_end_parallel_for(const uint64_t kID) { + if ((kID % kernelSampleSkip) == 0) { + getGlobFenceChoice(); // re-read environment variable to get most accurate value + if(0 < tool_globFence ) { + // TODO: invoke tool-induced fence from Kokkos_C_Profiling_interface 
+ } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", (unsigned long long)(kID)); @@ -181,9 +203,12 @@ void kokkosp_end_parallel_for(const uint64_t kID) { void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, uint64_t* kID) { - *kID = uniqID++; - + *kID = uniqID++; if (((*kID) % kernelSampleSkip) == 0) { + getGlobFenceChoice(); // re-read environment variable to get most accurate value + if(0 < tool_globFence ) { + // TODO: invoke tool-induced fence from Kokkos_C_Profiling_interface + } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); @@ -197,6 +222,10 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, void kokkosp_end_parallel_scan(const uint64_t kID) { if ((kID % kernelSampleSkip) == 0) { + getGlobFenceChoice(); // re-read environment variable to get most accurate value + if(0 < tool_globFence ) { + // TODO: invoke tool-induced fence from Kokkos_C_Profiling_interface + } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", (unsigned long long)(kID)); @@ -213,6 +242,10 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, *kID = uniqID++; if (((*kID) % kernelSampleSkip) == 0) { + getGlobFenceChoice(); // re-read environment variable to get most accurate value + if(0 < tool_globFence ) { + // TODO:invoke tool-induced fence from Kokkos_C_Profiling_interface + } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); @@ -225,7 +258,11 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, } void kokkosp_end_parallel_reduce(const uint64_t kID) { - if ((kID % kernelSampleSkip) == 0) { + if ((kID % kernelSampleSkip) == 0) { + getGlobFenceChoice(); // re-read environment variable to get most accurate value + if(0 < tool_globFence ) { + // TODO: invoke tool-induced fence from 
Kokkos_C_Profiling_interface + } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", (unsigned long long)(kID)); From 585b7bfc6e84db12ec5bc899262140830c970d61 Mon Sep 17 00:00:00 2001 From: Vivek Kale Date: Fri, 5 May 2023 14:50:34 -0700 Subject: [PATCH 02/29] formatted sampler --- common/kokkos-sampler/kp_sampler_skip.cpp | 73 ++++++++++++----------- 1 file changed, 38 insertions(+), 35 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 925dda19b..03f6c62c1 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -27,16 +27,14 @@ static endFunction endForCallee = NULL; static endFunction endScanCallee = NULL; static endFunction endReduceCallee = NULL; -void getGlobFenceChoice() -{ - // re-read environment variable to get most accurate value +void getGlobFenceChoice() { + // re-read environment variable to get most accurate value const char* tool_globFence_str = getenv("KOKKOS_TOOLS_GLOBALFENCES"); if (NULL != tool_globFence_str) { tool_globFence = atoi(tool_globFence_str); - } - else { + } else { tool_globFence = 0; - } + } } void kokkosp_request_tool_settings(const uint32_t, @@ -166,12 +164,13 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, uint64_t* kID) { *kID = uniqID++; - if (((*kID) % kernelSampleSkip) == 0) { - getGlobFenceChoice(); // re-read environment variable to get most accurate value - if(tool_globFence > 0) { - // invoke tool-induced fence from Kokkos_C_Profiling_interface - } - + if (((*kID) % kernelSampleSkip) == 0) { + getGlobFenceChoice(); // re-read environment variable to get most accurate + // value + if (tool_globFence > 0) { + // invoke tool-induced fence from Kokkos_C_Profiling_interface + } + if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); @@ -184,12 +183,12 @@ void 
kokkosp_begin_parallel_for(const char* name, const uint32_t devID, } void kokkosp_end_parallel_for(const uint64_t kID) { - if ((kID % kernelSampleSkip) == 0) { - getGlobFenceChoice(); // re-read environment variable to get most accurate value - if(0 < tool_globFence ) { - // TODO: invoke tool-induced fence from Kokkos_C_Profiling_interface - } + getGlobFenceChoice(); // re-read environment variable to get most accurate + // value + if (0 < tool_globFence) { + // TODO: invoke tool-induced fence from Kokkos_C_Profiling_interface + } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", (unsigned long long)(kID)); @@ -203,12 +202,13 @@ void kokkosp_end_parallel_for(const uint64_t kID) { void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, uint64_t* kID) { - *kID = uniqID++; + *kID = uniqID++; if (((*kID) % kernelSampleSkip) == 0) { - getGlobFenceChoice(); // re-read environment variable to get most accurate value - if(0 < tool_globFence ) { - // TODO: invoke tool-induced fence from Kokkos_C_Profiling_interface - } + getGlobFenceChoice(); // re-read environment variable to get most accurate + // value + if (0 < tool_globFence) { + // TODO: invoke tool-induced fence from Kokkos_C_Profiling_interface + } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); @@ -222,10 +222,11 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, void kokkosp_end_parallel_scan(const uint64_t kID) { if ((kID % kernelSampleSkip) == 0) { - getGlobFenceChoice(); // re-read environment variable to get most accurate value - if(0 < tool_globFence ) { - // TODO: invoke tool-induced fence from Kokkos_C_Profiling_interface - } + getGlobFenceChoice(); // re-read environment variable to get most accurate + // value + if (0 < tool_globFence) { + // TODO: invoke tool-induced fence from Kokkos_C_Profiling_interface + } if (tool_verbosity > 0) { printf("KokkosP: 
sample %llu calling child-end function...\n", (unsigned long long)(kID)); @@ -242,10 +243,11 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, *kID = uniqID++; if (((*kID) % kernelSampleSkip) == 0) { - getGlobFenceChoice(); // re-read environment variable to get most accurate value - if(0 < tool_globFence ) { - // TODO:invoke tool-induced fence from Kokkos_C_Profiling_interface - } + getGlobFenceChoice(); // re-read environment variable to get most accurate + // value + if (0 < tool_globFence) { + // TODO:invoke tool-induced fence from Kokkos_C_Profiling_interface + } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); @@ -258,11 +260,12 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, } void kokkosp_end_parallel_reduce(const uint64_t kID) { - if ((kID % kernelSampleSkip) == 0) { - getGlobFenceChoice(); // re-read environment variable to get most accurate value - if(0 < tool_globFence ) { - // TODO: invoke tool-induced fence from Kokkos_C_Profiling_interface - } + if ((kID % kernelSampleSkip) == 0) { + getGlobFenceChoice(); // re-read environment variable to get most accurate + // value + if (0 < tool_globFence) { + // TODO: invoke tool-induced fence from Kokkos_C_Profiling_interface + } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", (unsigned long long)(kID)); From 060c1aa0bc7d48a86438ba51f42fcd985f27d788 Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Thu, 18 May 2023 10:37:28 -0700 Subject: [PATCH 03/29] Update kp_sampler_skip.cpp Adding tool-invoked_fence function. Note that fence function is doing global fencing, and that the end_parallel_* will need a devID if doing per-device fencing in the future. 
--- common/kokkos-sampler/kp_sampler_skip.cpp | 76 +++++++++++++---------- 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 03f6c62c1..6510e7d4f 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -162,15 +162,16 @@ void kokkosp_finalize_library() { void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, uint64_t* kID) { - *kID = uniqID++; - - if (((*kID) % kernelSampleSkip) == 0) { + *kID = 0; + static uint64_t invocationNum; + ++invocationNum; + if ((invocationNum % kernelSampleSkip) == 0) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value if (tool_globFence > 0) { - // invoke tool-induced fence from Kokkos_C_Profiling_interface + Kokkos::Experimental::Impl::tool_invoked_fence(0); } - + *kID = 1; // set kernel ID to 1 so that it is matched with the end_parallel_* if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); @@ -183,17 +184,16 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, } void kokkosp_end_parallel_for(const uint64_t kID) { - if ((kID % kernelSampleSkip) == 0) { + if (kID > 0) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { - // TODO: invoke tool-induced fence from Kokkos_C_Profiling_interface + Kokkos::Experimental::Impl::tool_invoked_fence(0); } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", (unsigned long long)(kID)); } - if (NULL != endForCallee) { (*endForCallee)(kID); } @@ -202,18 +202,24 @@ void kokkosp_end_parallel_for(const uint64_t kID) { void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, uint64_t* kID) { - *kID = uniqID++; - if (((*kID) % kernelSampleSkip) == 0) { + + *kID = 0; // set memory location value of kID to 
0. + static uint64_t invocationNum; + ++invocationNum; + if ((invocationNum % kernelSampleSkip) == 0) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value - if (0 < tool_globFence) { - // TODO: invoke tool-induced fence from Kokkos_C_Profiling_interface + if (0 < tool_globFence) { + // using tool-induced fence from Kokkos_profiling rather than + // Kokkos_C_Profiling_interface. Note that this function + // only invokes a global (device 0 invoked) fence. + Kokkos::Experimental::Impl::tool_invoked_fence(0); } + *kID = 1; // set kernel ID to 1 so that it is matched with the end. if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); } - if (NULL != beginScanCallee) { (*beginScanCallee)(name, devID, kID); } @@ -221,60 +227,66 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, } void kokkosp_end_parallel_scan(const uint64_t kID) { - if ((kID % kernelSampleSkip) == 0) { + + if (kID > 0) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { - // TODO: invoke tool-induced fence from Kokkos_C_Profiling_interface + // using tool-induced fence from Kokkos_profiling rather than + // Kokkos_C_Profiling_interface. Note that this function + // only invokes a global (device 0 invoked) fence. 
+ Kokkos::Experimental::Impl::tool_invoked_fence(0); } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", (unsigned long long)(kID)); } - if (NULL != endScanCallee) { (*endScanCallee)(kID); } - } + } // end kID sample } void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, uint64_t* kID) { - *kID = uniqID++; - - if (((*kID) % kernelSampleSkip) == 0) { + *kID = 0; + static uint64_t invocationNum; + ++invocationNum; + if ((invocationNum % kernelSampleSkip) == 0) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value - if (0 < tool_globFence) { - // TODO:invoke tool-induced fence from Kokkos_C_Profiling_interface + if (0 < tool_globFence) { + // using tool-induced fence from Kokkos_profiling rather than + // Kokkos_C_Profiling_interface. Note that this function + // only invokes a global (device 0 invoked) fence. + Kokkos::Experimental::Impl::tool_invoked_fence(0); } + *kID = 1; // set kernel ID to 1 so that it is matched with the end. 
if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); - } - + } if (NULL != beginReduceCallee) { (*beginReduceCallee)(name, devID, kID); } - } + } } -void kokkosp_end_parallel_reduce(const uint64_t kID) { - if ((kID % kernelSampleSkip) == 0) { - getGlobFenceChoice(); // re-read environment variable to get most accurate +void kokkosp_end_parallel_reduce(const uint64_t kID) { + if (kID > 0) { + getGlobFenceChoice(); // re-read environment variable to get most accurate // value - if (0 < tool_globFence) { - // TODO: invoke tool-induced fence from Kokkos_C_Profiling_interface + if (0 < tool_globFence) { // Todo: see if this is a performance bottleneck + Kokkos::profiling::impl::fence(0); } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", (unsigned long long)(kID)); } - if (NULL != endReduceCallee) { (*endReduceCallee)(kID); } - } + } // end kID sample } } // namespace Sampler From 109843a582223cebf72098c5bafb43824be3fcf3 Mon Sep 17 00:00:00 2001 From: Vivek Kale Date: Thu, 18 May 2023 10:58:39 -0700 Subject: [PATCH 04/29] putting in sampler skip --- common/kokkos-sampler/kp_sampler_skip.cpp | 67 +++++++++++------------ example/kernels.hpp | 4 ++ profiling/all/CMakeLists.txt | 2 +- 3 files changed, 36 insertions(+), 37 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 6510e7d4f..c329772c2 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -92,9 +92,7 @@ void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, printf("KokkosP: Next library to call: %s\n", nextLibrary); printf("KokkosP: Loading child library ..\n"); } - void* childLibrary = dlopen(nextLibrary, RTLD_NOW | RTLD_GLOBAL); - if (NULL == childLibrary) { fprintf(stderr, "KokkosP: Error: Unable to load: %s (Error=%s)\n", nextLibrary, dlerror()); @@ -143,9 +141,7 @@ void 
kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, } free(envBuffer); - - uniqID = 1; - + uniqID = 1; const char* tool_sample = getenv("KOKKOS_TOOLS_SAMPLER_SKIP"); if (NULL != tool_sample) { kernelSampleSkip = atoi(tool_sample) + 1; @@ -169,9 +165,10 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, getGlobFenceChoice(); // re-read environment variable to get most accurate // value if (tool_globFence > 0) { - Kokkos::Experimental::Impl::tool_invoked_fence(0); + Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); } - *kID = 1; // set kernel ID to 1 so that it is matched with the end_parallel_* + *kID = + 1; // set kernel ID to 1 so that it is matched with the end_parallel_* if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); @@ -188,7 +185,7 @@ void kokkosp_end_parallel_for(const uint64_t kID) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { - Kokkos::Experimental::Impl::tool_invoked_fence(0); + Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", @@ -202,20 +199,19 @@ void kokkosp_end_parallel_for(const uint64_t kID) { void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, uint64_t* kID) { - - *kID = 0; // set memory location value of kID to 0. + *kID = 0; // set memory location value of kID to 0. static uint64_t invocationNum; - ++invocationNum; + ++invocationNum; if ((invocationNum % kernelSampleSkip) == 0) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value - if (0 < tool_globFence) { - // using tool-induced fence from Kokkos_profiling rather than - // Kokkos_C_Profiling_interface. Note that this function - // only invokes a global (device 0 invoked) fence. 
- Kokkos::Experimental::Impl::tool_invoked_fence(0); + if (0 < tool_globFence) { + // using tool-induced fence from Kokkos_profiling rather than + // Kokkos_C_Profiling_interface. Note that this function + // only invokes a global (device 0 invoked) fence. + Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); } - *kID = 1; // set kernel ID to 1 so that it is matched with the end. + *kID = 1; // set kernel ID to 1 so that it is matched with the end. if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); @@ -227,15 +223,14 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, } void kokkosp_end_parallel_scan(const uint64_t kID) { - if (kID > 0) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { - // using tool-induced fence from Kokkos_profiling rather than - // Kokkos_C_Profiling_interface. Note that this function - // only invokes a global (device 0 invoked) fence. - Kokkos::Experimental::Impl::tool_invoked_fence(0); + // using tool-induced fence from Kokkos_profiling rather than + // Kokkos_C_Profiling_interface. Note that this function + // only invokes a global (device 0 invoked) fence. 
+ Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", @@ -244,7 +239,7 @@ void kokkosp_end_parallel_scan(const uint64_t kID) { if (NULL != endScanCallee) { (*endScanCallee)(kID); } - } // end kID sample + } // end kID sample } void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, @@ -255,29 +250,29 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, if ((invocationNum % kernelSampleSkip) == 0) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value - if (0 < tool_globFence) { - // using tool-induced fence from Kokkos_profiling rather than - // Kokkos_C_Profiling_interface. Note that this function - // only invokes a global (device 0 invoked) fence. - Kokkos::Experimental::Impl::tool_invoked_fence(0); + if (0 < tool_globFence) { + // using tool-induced fence from Kokkos_profiling rather than + // Kokkos_C_Profiling_interface. Note that this function + // only invokes a global (device 0 invoked) fence. + Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); } - *kID = 1; // set kernel ID to 1 so that it is matched with the end. + *kID = 1; // set kernel ID to 1 so that it is matched with the end. 
if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); - } + } if (NULL != beginReduceCallee) { (*beginReduceCallee)(name, devID, kID); } - } + } } -void kokkosp_end_parallel_reduce(const uint64_t kID) { +void kokkosp_end_parallel_reduce(const uint64_t kID) { if (kID > 0) { - getGlobFenceChoice(); // re-read environment variable to get most accurate + getGlobFenceChoice(); // re-read environment variable to get most accurate // value - if (0 < tool_globFence) { // Todo: see if this is a performance bottleneck - Kokkos::profiling::impl::fence(0); + if (0 < tool_globFence) { // Todo: see if this is a performance bottleneck + Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", @@ -286,7 +281,7 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { if (NULL != endReduceCallee) { (*endReduceCallee)(kID); } - } // end kID sample + } // end kID sample } } // namespace Sampler diff --git a/example/kernels.hpp b/example/kernels.hpp index 924a0e19e..ba73393fb 100644 --- a/example/kernels.hpp +++ b/example/kernels.hpp @@ -9,6 +9,8 @@ int run_calculation(const data_type SIZE) { Kokkos::View data(Kokkos::ViewAllocateWithoutInitializing("data"), SIZE); +for (int tstep = 0; tstep < 1000; tstep++) +{ Kokkos::parallel_for( "initialize()", SIZE, KOKKOS_LAMBDA(data_type i) { data(i) = i; }); Kokkos::fence(); @@ -30,6 +32,8 @@ int run_calculation(const data_type SIZE) { return 1; } std::cout << "Result OK: S(" << SIZE << ") = " << sum << std::endl; +} // end timestep loop + return 0; } diff --git a/profiling/all/CMakeLists.txt b/profiling/all/CMakeLists.txt index ce8b13e27..786f2c2a5 100644 --- a/profiling/all/CMakeLists.txt +++ b/profiling/all/CMakeLists.txt @@ -19,4 +19,4 @@ endif() file(GLOB_RECURSE HEADER_FILES CONFIGURE_DEPENDS kp_all.hpp "${COMMON_HEADERS_PATH}/*.hpp") install(FILES ${HEADER_FILES} DESTINATION 
${EXPORT_INCLUDE_DIR}) -install(TARGETS ${LIBNAME} EXPORT ${EXPORT_NAME}) \ No newline at end of file +install(TARGETS ${LIBNAME} EXPORT ${EXPORT_NAME}) From 9e766615e7f7dcc40757605e1f2e23c5303733c4 Mon Sep 17 00:00:00 2001 From: Vivek Kale Date: Wed, 31 May 2023 22:21:58 -0700 Subject: [PATCH 05/29] putting in new sampler with appropriate functions. Still need to get device ID for fence (which could be obtained through the space handle). --- common/kokkos-sampler/kp_sampler_skip.cpp | 24 +++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index c329772c2..6ec9b24f0 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -6,12 +6,14 @@ #include "../../profiling/all/kp_core.hpp" #include "kp_config.hpp" +using tpi = Kokkos::Tools::Experimental::Kokkos_Tools_ToolProgrammingInterface; namespace KokkosTools { namespace Sampler { static uint64_t uniqID = 0; static uint64_t kernelSampleSkip = 101; static int tool_verbosity = 0; static int tool_globFence = 0; +tpi mytpi; typedef void (*initFunction)(const int, const uint64_t, const uint32_t, void*); typedef void (*finalizeFunction)(); @@ -37,6 +39,12 @@ void getGlobFenceChoice() { } } +// void kokkosp_tool_invoked_fence(const uint32_t, Kokkos_Tools_SpaceHandle* myspchandle, Kokkos_Tools_toolInvokedFenceFunction tool_fence) +// { + // (*tool_fence)(myspchandle, ); +// } + + void kokkosp_request_tool_settings(const uint32_t, Kokkos_Tools_ToolSettings* settings) { if (0 == tool_globFence) { @@ -165,7 +173,7 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, getGlobFenceChoice(); // re-read environment variable to get most accurate // value if (tool_globFence > 0) { - Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + mytpi.fence(0); } *kID = 1; // set kernel ID to 1 so that it is matched with the end_parallel_* @@ -185,7 
+193,7 @@ void kokkosp_end_parallel_for(const uint64_t kID) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { - Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + mytpi.fence(0); } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", @@ -209,7 +217,7 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function // only invokes a global (device 0 invoked) fence. - Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + myfence(0); } *kID = 1; // set kernel ID to 1 so that it is matched with the end. if (tool_verbosity > 0) { @@ -230,7 +238,7 @@ void kokkosp_end_parallel_scan(const uint64_t kID) { // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function // only invokes a global (device 0 invoked) fence. - Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + mytpi.fence(0); } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", @@ -254,7 +262,9 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function // only invokes a global (device 0 invoked) fence. - Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + mytpi.fence(0); + + // Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); } *kID = 1; // set kernel ID to 1 so that it is matched with the end. 
if (tool_verbosity > 0) { @@ -272,7 +282,9 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { // Todo: see if this is a performance bottleneck - Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + // Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + // Kokkos::Tools::SpaceHandle::Kokkos_Profiling_SpaceHandle mysphndle; + mytpi.fence(0); // TODO: get spacehandle to identify where to fence. } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", From 2e97fa67c5ebd082d43ef469e546761c7364ef10 Mon Sep 17 00:00:00 2001 From: Vivek Kale Date: Wed, 31 May 2023 22:22:46 -0700 Subject: [PATCH 06/29] putting in new sampler with appropriate functions. Still need to get device ID for fence (which could be obtained through the space handle). Fixing with formatting. --- common/kokkos-sampler/kp_sampler_skip.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 6ec9b24f0..606c5f2ee 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -39,12 +39,12 @@ void getGlobFenceChoice() { } } -// void kokkosp_tool_invoked_fence(const uint32_t, Kokkos_Tools_SpaceHandle* myspchandle, Kokkos_Tools_toolInvokedFenceFunction tool_fence) +// void kokkosp_tool_invoked_fence(const uint32_t, Kokkos_Tools_SpaceHandle* +// myspchandle, Kokkos_Tools_toolInvokedFenceFunction tool_fence) // { - // (*tool_fence)(myspchandle, ); +// (*tool_fence)(myspchandle, ); // } - void kokkosp_request_tool_settings(const uint32_t, Kokkos_Tools_ToolSettings* settings) { if (0 == tool_globFence) { @@ -173,7 +173,7 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, getGlobFenceChoice(); // re-read environment variable to get most accurate // value if 
(tool_globFence > 0) { - mytpi.fence(0); + mytpi.fence(0); } *kID = 1; // set kernel ID to 1 so that it is matched with the end_parallel_* @@ -217,7 +217,7 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function // only invokes a global (device 0 invoked) fence. - myfence(0); + myfence(0); } *kID = 1; // set kernel ID to 1 so that it is matched with the end. if (tool_verbosity > 0) { @@ -238,7 +238,7 @@ void kokkosp_end_parallel_scan(const uint64_t kID) { // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function // only invokes a global (device 0 invoked) fence. - mytpi.fence(0); + mytpi.fence(0); } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", @@ -262,9 +262,9 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function // only invokes a global (device 0 invoked) fence. - mytpi.fence(0); - - // Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + mytpi.fence(0); + + // Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); } *kID = 1; // set kernel ID to 1 so that it is matched with the end. if (tool_verbosity > 0) { @@ -282,9 +282,9 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { // Todo: see if this is a performance bottleneck - // Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + // Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); // Kokkos::Tools::SpaceHandle::Kokkos_Profiling_SpaceHandle mysphndle; - mytpi.fence(0); // TODO: get spacehandle to identify where to fence. + mytpi.fence(0); // TODO: get spacehandle to identify where to fence. 
} if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", From 8c5e5239cc6d27ad96c44c7f5c09ff885317c371 Mon Sep 17 00:00:00 2001 From: Vivek Kale Date: Wed, 31 May 2023 23:01:50 -0700 Subject: [PATCH 07/29] putting in new sampler with appropriate functions, using profiling interface function. Still need to get device ID for fence (which could be obtained through the space handle). Fixing with formatting. --- common/kokkos-sampler/kp_sampler_skip.cpp | 66 ++++++++++++----------- 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 606c5f2ee..2ed86bf68 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -6,14 +6,18 @@ #include "../../profiling/all/kp_core.hpp" #include "kp_config.hpp" -using tpi = Kokkos::Tools::Experimental::Kokkos_Tools_ToolProgrammingInterface; +// using mytpi_type = Kokkos::Tools::Experimental::ToolProgrammingInterface; + +// mytpi_type mytpi; + namespace KokkosTools { namespace Sampler { static uint64_t uniqID = 0; static uint64_t kernelSampleSkip = 101; static int tool_verbosity = 0; static int tool_globFence = 0; -tpi mytpi; +// mytpi_type mytpi; +Kokkos::Tools::Experimental::ToolProgrammingInterface mytpi; typedef void (*initFunction)(const int, const uint64_t, const uint32_t, void*); typedef void (*finalizeFunction)(); @@ -34,15 +38,14 @@ void getGlobFenceChoice() { const char* tool_globFence_str = getenv("KOKKOS_TOOLS_GLOBALFENCES"); if (NULL != tool_globFence_str) { tool_globFence = atoi(tool_globFence_str); - } else { - tool_globFence = 0; - } + } // else + // tool_globFence = 0; } // void kokkosp_tool_invoked_fence(const uint32_t, Kokkos_Tools_SpaceHandle* // myspchandle, Kokkos_Tools_toolInvokedFenceFunction tool_fence) -// { -// (*tool_fence)(myspchandle, ); +//{ +//(*tool_fence)(myspchandle, ); // } void kokkosp_request_tool_settings(const uint32_t, 
@@ -112,6 +115,14 @@ void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, (beginFunction)dlsym(childLibrary, "kokkosp_begin_parallel_scan"); beginReduceCallee = (beginFunction)dlsym(childLibrary, "kokkosp_begin_parallel_reduce"); + } + else { + beginForCallee = + (beginFunction)dlsym(childLibrary, "kokkosp_begin_parallel_for"); + beginScanCallee = + (beginFunction)dlsym(childLibrary, "kokkosp_begin_parallel_scan"); + beginReduceCallee = + (beginFunction)dlsym(childLibrary, "kokkosp_begin_parallel_reduce"); endScanCallee = (endFunction)dlsym(childLibrary, "kokkosp_end_parallel_scan"); @@ -207,28 +218,18 @@ void kokkosp_end_parallel_for(const uint64_t kID) { void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, uint64_t* kID) { - *kID = 0; // set memory location value of kID to 0. - static uint64_t invocationNum; - ++invocationNum; - if ((invocationNum % kernelSampleSkip) == 0) { - getGlobFenceChoice(); // re-read environment variable to get most accurate - // value - if (0 < tool_globFence) { - // using tool-induced fence from Kokkos_profiling rather than - // Kokkos_C_Profiling_interface. Note that this function - // only invokes a global (device 0 invoked) fence. - myfence(0); - } - *kID = 1; // set kernel ID to 1 so that it is matched with the end. - if (tool_verbosity > 0) { - printf("KokkosP: sample %llu calling child-begin function...\n", - (unsigned long long)(*kID)); - } - if (NULL != beginScanCallee) { - (*beginScanCallee)(name, devID, kID); - } - } + // Kokkos_C_Profiling_interface. Note that this function + // only invokes a global (device 0 invoked) fence. 
+ mytpi.fence(0); + + printf("KokkosP: sample %llu calling child-begin function...\n", + (unsigned long long)(*kID)); } +if (NULL != beginScanCallee) { + (*beginScanCallee)(name, devID, kID); +} +} // namespace Sampler +} // namespace KokkosTools void kokkosp_end_parallel_scan(const uint64_t kID) { if (kID > 0) { @@ -263,8 +264,6 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, // Kokkos_C_Profiling_interface. Note that this function // only invokes a global (device 0 invoked) fence. mytpi.fence(0); - - // Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); } *kID = 1; // set kernel ID to 1 so that it is matched with the end. if (tool_verbosity > 0) { @@ -282,9 +281,8 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { // Todo: see if this is a performance bottleneck - // Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + mytpi.fence(0); // Kokkos::Tools::SpaceHandle::Kokkos_Profiling_SpaceHandle mysphndle; - mytpi.fence(0); // TODO: get spacehandle to identify where to fence. 
} if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", @@ -302,7 +300,11 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { extern "C" { namespace impl = KokkosTools::Sampler; +// Expose Kokkos TPI +// +// Kokkos_Tools_provideToolProgrammingInterfaceFunction; +// EXPOSE_TOOL_INTERFACE(impl::kokkosp_provideToolsProgrammingInterface) EXPOSE_TOOL_SETTINGS(impl::kokkosp_request_tool_settings) EXPOSE_INIT(impl::kokkosp_init_library) EXPOSE_FINALIZE(impl::kokkosp_finalize_library) From 2c1efacf2b9c9702dceb70f9d13819100d7a783c Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Thu, 1 Jun 2023 10:00:19 -0700 Subject: [PATCH 08/29] putting in fix to sampler --- common/kokkos-sampler/kp_sampler_skip.cpp | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index c329772c2..297dffa70 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -6,12 +6,14 @@ #include "../../profiling/all/kp_core.hpp" #include "kp_config.hpp" +using mytpi_type = Kokkos::Tools::Experimental::ToolProgrammingInterface; namespace KokkosTools { namespace Sampler { static uint64_t uniqID = 0; static uint64_t kernelSampleSkip = 101; static int tool_verbosity = 0; static int tool_globFence = 0; +mytpi_type mytpi; typedef void (*initFunction)(const int, const uint64_t, const uint32_t, void*); typedef void (*finalizeFunction)(); @@ -165,7 +167,8 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, getGlobFenceChoice(); // re-read environment variable to get most accurate // value if (tool_globFence > 0) { - Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + mytpi.fence(0); + // Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); } *kID = 1; // set kernel ID to 1 so that it is matched with the end_parallel_* @@ -185,7 
+188,8 @@ void kokkosp_end_parallel_for(const uint64_t kID) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { - Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + mytpi.fence(0); + // Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", @@ -209,7 +213,8 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function // only invokes a global (device 0 invoked) fence. - Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + //Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + mytpi.fence(0); } *kID = 1; // set kernel ID to 1 so that it is matched with the end. if (tool_verbosity > 0) { @@ -230,7 +235,8 @@ void kokkosp_end_parallel_scan(const uint64_t kID) { // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function // only invokes a global (device 0 invoked) fence. - Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + mytpi.fence(0); + // Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", @@ -254,7 +260,9 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function // only invokes a global (device 0 invoked) fence. - Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + + mytpi.fence(0); + // Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); } *kID = 1; // set kernel ID to 1 so that it is matched with the end. 
if (tool_verbosity > 0) { @@ -272,7 +280,9 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { // Todo: see if this is a performance bottleneck - Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); + + mytpi.fence(0); + // Kokkos::Tools::Experimental::Impl::tool_invoked_fence(0); } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", From 9c02dda5da41bd2c2f30785d36dbe8906901dc8c Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Thu, 1 Jun 2023 11:19:23 -0700 Subject: [PATCH 09/29] committing sampler formatted --- common/kokkos-sampler/kp_sampler_skip.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index c277246ca..6a94522e7 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -13,7 +13,7 @@ static uint64_t uniqID = 0; static uint64_t kernelSampleSkip = 101; static int tool_verbosity = 0; static int tool_globFence = 0; -mytpi_type mytpi; +mytpi_type mytpi; typedef void (*initFunction)(const int, const uint64_t, const uint32_t, void*); typedef void (*finalizeFunction)(); @@ -226,7 +226,6 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, } } - void kokkosp_end_parallel_scan(const uint64_t kID) { if (kID > 0) { getGlobFenceChoice(); // re-read environment variable to get most accurate @@ -259,8 +258,8 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function // only invokes a global (device 0 invoked) fence. - mytpi.fence(0); - } + mytpi.fence(0); + } *kID = 1; // set kernel ID to 1 so that it is matched with the end. 
if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", @@ -276,8 +275,8 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { if (kID > 0) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value - if (0 < tool_globFence) { // Todo: see if this is a performance bottleneck - mytpi.fence(0); + if (0 < tool_globFence) { // Todo: see if this is a performance bottleneck + mytpi.fence(0); } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", From 8264f55170f650453c24ae31e36ed46ef70774b7 Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Thu, 8 Jun 2023 09:55:31 -0700 Subject: [PATCH 10/29] fixes to Kokkos_Tools_ToolProgrammingInterface invocation --- common/kokkos-sampler/kp_sampler_skip.cpp | 45 +++++++++++++++++------ profiling/all/kp_core.hpp | 6 +++ 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 6a94522e7..8a9e9048b 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -3,17 +3,19 @@ #include #include #include +// #include #include "../../profiling/all/kp_core.hpp" #include "kp_config.hpp" -using mytpi_type = Kokkos::Tools::Experimental::ToolProgrammingInterface; +// using Kokkos::Tools::Experimental; +// using mytpi_type = Kokkos::Tools::Experimental::ToolProgrammingInterface; namespace KokkosTools { namespace Sampler { static uint64_t uniqID = 0; static uint64_t kernelSampleSkip = 101; static int tool_verbosity = 0; static int tool_globFence = 0; -mytpi_type mytpi; +// mytpi_type mytpi; typedef void (*initFunction)(const int, const uint64_t, const uint32_t, void*); typedef void (*finalizeFunction)(); @@ -44,6 +46,24 @@ void getGlobFenceChoice() { //(*tool_fence)(myspchandle, ); // } +// set of functions from Kokkos ToolProgrammingInterface (includes fence) 
+Kokkos::Tools::Experimental::ToolProgrammingInterface tpi_funcs; + +void invoke_ktools_fence(uint32_t devID) { + // assert( tpi_funcs ! = NULL) + if (tpi_funcs.fence != nullptr) { + tpi_funcs.fence(devID); + } else + printf( + "KokkosP: FATAL: Kokkos Tools Programming Interface's tool-invoked " + "Fence is NULL!\n"); +} + +void kokkosp_provide_tool_programming_interface( + uint32_t num_funcs, Kokkos_Tools_ToolProgrammingInterface* funcsFromTPI) { + tpi_funcs = *funcsFromTPI; +} + void kokkosp_request_tool_settings(const uint32_t, Kokkos_Tools_ToolSettings* settings) { if (0 == tool_globFence) { @@ -168,8 +188,9 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, if ((invocationNum % kernelSampleSkip) == 0) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value - if (tool_globFence > 0) { - mytpi.fence(0); + if (0 < tool_globFence) { + invoke_ktools_fence( + 0); // invoke tool-induced fence from device 0 for now } *kID = 1; // set kernel ID to 1 so that it is matched with the end_parallel_* @@ -189,7 +210,8 @@ void kokkosp_end_parallel_for(const uint64_t kID) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { - mytpi.fence(0); + invoke_ktools_fence( + 0); // invoke tool-induced fence from device 0 for now } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", @@ -212,8 +234,8 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, if (0 < tool_globFence) { // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function - // only invokes a global (device 0 invoked) fence. - mytpi.fence(0); + // only invokes a global (device 0 invoked) fence + invoke_ktools_fence(0); } *kID = 1; // set kernel ID to 1 so that it is matched with the end. 
if (tool_verbosity > 0) { @@ -234,7 +256,7 @@ void kokkosp_end_parallel_scan(const uint64_t kID) { // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function // only invokes a global (device 0 invoked) fence. - mytpi.fence(0); + invoke_ktools_fence(0); } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", @@ -258,7 +280,7 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function // only invokes a global (device 0 invoked) fence. - mytpi.fence(0); + invoke_ktools_fence(0); } *kID = 1; // set kernel ID to 1 so that it is matched with the end. if (tool_verbosity > 0) { @@ -276,7 +298,7 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { // Todo: see if this is a performance bottleneck - mytpi.fence(0); + invoke_ktools_fence(0); } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", @@ -294,7 +316,8 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { extern "C" { namespace impl = KokkosTools::Sampler; - +EXPOSE_TOOL_PROGRAMMING_INTERFACE( + impl::kokkosp_provide_tool_programming_interface) EXPOSE_TOOL_SETTINGS(impl::kokkosp_request_tool_settings) EXPOSE_INIT(impl::kokkosp_init_library) diff --git a/profiling/all/kp_core.hpp b/profiling/all/kp_core.hpp index a1834159b..8441815db 100644 --- a/profiling/all/kp_core.hpp +++ b/profiling/all/kp_core.hpp @@ -55,6 +55,12 @@ using Kokkos::Tools::SpaceHandle; FUNC_NAME(num_actions, settings); \ } +#define EXPOSE_TOOL_PROGRAMMING_INTERFACE(FUNC_NAME) \ + __attribute__((weak)) void kokkosp_provide_tool_programming_interface( \ + const uint32_t num_actions, Kokkos_Tools_ToolProgrammingInterface* tool_funcs) { \ + FUNC_NAME(num_actions, 
tool_funcs); \ + } + #define EXPOSE_INIT(FUNC_NAME) \ __attribute__((weak)) void kokkosp_init_library( \ const int loadSeq, const uint64_t interfaceVer, \ From 85b4d1a7b94ecdcee5081f585928b0e496848e57 Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Thu, 8 Jun 2023 16:42:59 -0700 Subject: [PATCH 11/29] formatted fix to kp sampler --- common/kokkos-sampler/kp_sampler_skip.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 8a9e9048b..00b881d4e 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -61,6 +61,12 @@ void invoke_ktools_fence(uint32_t devID) { void kokkosp_provide_tool_programming_interface( uint32_t num_funcs, Kokkos_Tools_ToolProgrammingInterface* funcsFromTPI) { + if (!num_funcs) { + if (tool_verbosity > 0) + printf( + "KokkosP: Note: Number of functions in Tools Programming Interface " + "is 0!\n"); + } tpi_funcs = *funcsFromTPI; } From 95c3af281b40e94048c99d7b40ed596c69b371fc Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Thu, 8 Jun 2023 16:48:51 -0700 Subject: [PATCH 12/29] committed formatted kp_core --- profiling/all/kp_core.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/profiling/all/kp_core.hpp b/profiling/all/kp_core.hpp index 8441815db..587f89e1e 100644 --- a/profiling/all/kp_core.hpp +++ b/profiling/all/kp_core.hpp @@ -55,10 +55,11 @@ using Kokkos::Tools::SpaceHandle; FUNC_NAME(num_actions, settings); \ } -#define EXPOSE_TOOL_PROGRAMMING_INTERFACE(FUNC_NAME) \ - __attribute__((weak)) void kokkosp_provide_tool_programming_interface( \ - const uint32_t num_actions, Kokkos_Tools_ToolProgrammingInterface* tool_funcs) { \ - FUNC_NAME(num_actions, tool_funcs); \ +#define EXPOSE_TOOL_PROGRAMMING_INTERFACE(FUNC_NAME) \ + __attribute__((weak)) void 
kokkosp_provide_tool_programming_interface( \ + const uint32_t num_actions, \ + Kokkos_Tools_ToolProgrammingInterface* tool_funcs) { \ + FUNC_NAME(num_actions, tool_funcs); \ } #define EXPOSE_INIT(FUNC_NAME) \ From 5ee1a1f226a74f268d3fb21a56bfeb02b3a1c4bf Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Thu, 8 Jun 2023 16:52:25 -0700 Subject: [PATCH 13/29] formatted kernels.hpp --- example/kernels.hpp | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/example/kernels.hpp b/example/kernels.hpp index ba73393fb..c25088e70 100644 --- a/example/kernels.hpp +++ b/example/kernels.hpp @@ -9,30 +9,29 @@ int run_calculation(const data_type SIZE) { Kokkos::View data(Kokkos::ViewAllocateWithoutInitializing("data"), SIZE); -for (int tstep = 0; tstep < 1000; tstep++) -{ - Kokkos::parallel_for( - "initialize()", SIZE, KOKKOS_LAMBDA(data_type i) { data(i) = i; }); - Kokkos::fence(); + for (int tstep = 0; tstep < 1000; tstep++) { + Kokkos::parallel_for( + "initialize()", SIZE, KOKKOS_LAMBDA(data_type i) { data(i) = i; }); + Kokkos::fence(); - data_type sum = 0; - Kokkos::parallel_reduce( - "accumulate()", SIZE, - KOKKOS_LAMBDA(data_type i, data_type & lsum) { lsum += 1 + data(i); }, - sum); - Kokkos::fence(); + data_type sum = 0; + Kokkos::parallel_reduce( + "accumulate()", SIZE, + KOKKOS_LAMBDA(data_type i, data_type & lsum) { lsum += 1 + data(i); }, + sum); + Kokkos::fence(); - Kokkos::Profiling::popRegion(); + Kokkos::Profiling::popRegion(); - // check results - const data_type check = (SIZE + 1) * SIZE / 2; - if (sum != check) { - std::cout << "BAD result, got S(" << SIZE << ") = " << sum << " (expected " - << check << ")" << std::endl; - return 1; - } - std::cout << "Result OK: S(" << SIZE << ") = " << sum << std::endl; -} // end timestep loop + // check results + const data_type check = (SIZE + 1) * SIZE / 2; + if (sum != check) { + std::cout << "BAD result, got S(" << SIZE << 
") = " << sum + << " (expected " << check << ")" << std::endl; + return 1; + } + std::cout << "Result OK: S(" << SIZE << ") = " << sum << std::endl; + } // end timestep loop return 0; } From 20e5a3027d48d7403671e0991f891db8efcdf2ac Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Thu, 20 Jul 2023 16:32:15 -0700 Subject: [PATCH 14/29] Update common/kokkos-sampler/kp_sampler_skip.cpp Co-authored-by: Daniel Arndt --- common/kokkos-sampler/kp_sampler_skip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 00b881d4e..99a22c1b7 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -31,7 +31,7 @@ static endFunction endForCallee = NULL; static endFunction endScanCallee = NULL; static endFunction endReduceCallee = NULL; -void getGlobFenceChoice() { +void get_global_fence_choice() { // re-read environment variable to get most accurate value const char* tool_globFence_str = getenv("KOKKOS_TOOLS_GLOBALFENCES"); if (NULL != tool_globFence_str) { From 758f3f9180ed251af778f9e27fd52b2991e3bf4f Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Thu, 20 Jul 2023 17:05:36 -0700 Subject: [PATCH 15/29] Removing all commented code Removing all commented code (which is no longer needed) as requested. 
--- common/kokkos-sampler/kp_sampler_skip.cpp | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 99a22c1b7..3c77f3612 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -3,19 +3,15 @@ #include #include #include -// #include #include "../../profiling/all/kp_core.hpp" #include "kp_config.hpp" -// using Kokkos::Tools::Experimental; -// using mytpi_type = Kokkos::Tools::Experimental::ToolProgrammingInterface; namespace KokkosTools { namespace Sampler { static uint64_t uniqID = 0; static uint64_t kernelSampleSkip = 101; static int tool_verbosity = 0; static int tool_globFence = 0; -// mytpi_type mytpi; typedef void (*initFunction)(const int, const uint64_t, const uint32_t, void*); typedef void (*finalizeFunction)(); @@ -40,17 +36,11 @@ void get_global_fence_choice() { // tool_globFence = 0; } -// void kokkosp_tool_invoked_fence(const uint32_t, Kokkos_Tools_SpaceHandle* -// myspchandle, Kokkos_Tools_toolInvokedFenceFunction tool_fence) -//{ -//(*tool_fence)(myspchandle, ); -// } // set of functions from Kokkos ToolProgrammingInterface (includes fence) Kokkos::Tools::Experimental::ToolProgrammingInterface tpi_funcs; void invoke_ktools_fence(uint32_t devID) { - // assert( tpi_funcs ! 
= NULL) if (tpi_funcs.fence != nullptr) { tpi_funcs.fence(devID); } else @@ -303,7 +293,7 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { if (kID > 0) { getGlobFenceChoice(); // re-read environment variable to get most accurate // value - if (0 < tool_globFence) { // Todo: see if this is a performance bottleneck + if (0 < tool_globFence) { invoke_ktools_fence(0); } if (tool_verbosity > 0) { From 91a48f5a0e825b31808f7437e52af08c292dbdec Mon Sep 17 00:00:00 2001 From: Vivek Kale Date: Thu, 20 Jul 2023 18:42:14 -0700 Subject: [PATCH 16/29] fix name of glob fence choice --- common/kokkos-sampler/kp_sampler_skip.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 3c77f3612..1b18a46fd 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -32,8 +32,7 @@ void get_global_fence_choice() { const char* tool_globFence_str = getenv("KOKKOS_TOOLS_GLOBALFENCES"); if (NULL != tool_globFence_str) { tool_globFence = atoi(tool_globFence_str); - } // else - // tool_globFence = 0; + } } @@ -87,7 +86,7 @@ void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, char* profileLibrary = getenv("KOKKOS_TOOLS_LIBS"); if (NULL == profileLibrary) { printf( - "Checking KOKKOS_PROFILE_LIBRARY. WARNING: This is a depreciated " + "KokkosP: Checking KOKKOS_PROFILE_LIBRARY. WARNING: This is a deprecated " "variable. 
Please use KOKKOS_TOOLS_LIBS\n"); profileLibrary = getenv("KOKKOS_PROFILE_LIBRARY"); if (NULL == profileLibrary) { @@ -182,7 +181,7 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, static uint64_t invocationNum; ++invocationNum; if ((invocationNum % kernelSampleSkip) == 0) { - getGlobFenceChoice(); // re-read environment variable to get most accurate + get_global_fence_choice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { invoke_ktools_fence( @@ -203,7 +202,7 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, void kokkosp_end_parallel_for(const uint64_t kID) { if (kID > 0) { - getGlobFenceChoice(); // re-read environment variable to get most accurate + get_global_fence_choice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { invoke_ktools_fence( @@ -225,7 +224,7 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, static uint64_t invocationNum; ++invocationNum; if ((invocationNum % kernelSampleSkip) == 0) { - getGlobFenceChoice(); // re-read environment variable to get most accurate + get_global_fence_choice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { // using tool-induced fence from Kokkos_profiling rather than @@ -246,7 +245,7 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, void kokkosp_end_parallel_scan(const uint64_t kID) { if (kID > 0) { - getGlobFenceChoice(); // re-read environment variable to get most accurate + get_global_fence_choice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { // using tool-induced fence from Kokkos_profiling rather than @@ -270,7 +269,7 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, static uint64_t invocationNum; ++invocationNum; if ((invocationNum % kernelSampleSkip) == 0) { - getGlobFenceChoice(); // re-read environment variable 
to get most accurate + get_global_fence_choice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { // using tool-induced fence from Kokkos_profiling rather than @@ -291,7 +290,7 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, void kokkosp_end_parallel_reduce(const uint64_t kID) { if (kID > 0) { - getGlobFenceChoice(); // re-read environment variable to get most accurate + get_global_fence_choice(); // re-read environment variable to get most accurate // value if (0 < tool_globFence) { invoke_ktools_fence(0); From a6c7bc1c7c815d364f7ff22798d34f998847b402 Mon Sep 17 00:00:00 2001 From: Vivek Kale Date: Thu, 20 Jul 2023 19:01:40 -0700 Subject: [PATCH 17/29] applied clang format --- common/kokkos-sampler/kp_sampler_skip.cpp | 32 +++++++++++------------ 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 1b18a46fd..c338ef7dd 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -32,10 +32,9 @@ void get_global_fence_choice() { const char* tool_globFence_str = getenv("KOKKOS_TOOLS_GLOBALFENCES"); if (NULL != tool_globFence_str) { tool_globFence = atoi(tool_globFence_str); - } + } } - // set of functions from Kokkos ToolProgrammingInterface (includes fence) Kokkos::Tools::Experimental::ToolProgrammingInterface tpi_funcs; @@ -86,7 +85,8 @@ void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, char* profileLibrary = getenv("KOKKOS_TOOLS_LIBS"); if (NULL == profileLibrary) { printf( - "KokkosP: Checking KOKKOS_PROFILE_LIBRARY. WARNING: This is a deprecated " + "KokkosP: Checking KOKKOS_PROFILE_LIBRARY. WARNING: This is a " + "deprecated " "variable. 
Please use KOKKOS_TOOLS_LIBS\n"); profileLibrary = getenv("KOKKOS_PROFILE_LIBRARY"); if (NULL == profileLibrary) { @@ -181,8 +181,8 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, static uint64_t invocationNum; ++invocationNum; if ((invocationNum % kernelSampleSkip) == 0) { - get_global_fence_choice(); // re-read environment variable to get most accurate - // value + get_global_fence_choice(); // re-read environment variable to get most + // accurate value if (0 < tool_globFence) { invoke_ktools_fence( 0); // invoke tool-induced fence from device 0 for now @@ -202,8 +202,8 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, void kokkosp_end_parallel_for(const uint64_t kID) { if (kID > 0) { - get_global_fence_choice(); // re-read environment variable to get most accurate - // value + get_global_fence_choice(); // re-read environment variable to get most + // accurate value if (0 < tool_globFence) { invoke_ktools_fence( 0); // invoke tool-induced fence from device 0 for now @@ -224,8 +224,8 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, static uint64_t invocationNum; ++invocationNum; if ((invocationNum % kernelSampleSkip) == 0) { - get_global_fence_choice(); // re-read environment variable to get most accurate - // value + get_global_fence_choice(); // re-read environment variable to get most + // accurate value if (0 < tool_globFence) { // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. 
Note that this function @@ -245,8 +245,8 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, void kokkosp_end_parallel_scan(const uint64_t kID) { if (kID > 0) { - get_global_fence_choice(); // re-read environment variable to get most accurate - // value + get_global_fence_choice(); // re-read environment variable to get most + // accurate value if (0 < tool_globFence) { // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function @@ -269,8 +269,8 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, static uint64_t invocationNum; ++invocationNum; if ((invocationNum % kernelSampleSkip) == 0) { - get_global_fence_choice(); // re-read environment variable to get most accurate - // value + get_global_fence_choice(); // re-read environment variable to get most + // accurate value if (0 < tool_globFence) { // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. 
Note that this function @@ -290,9 +290,9 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, void kokkosp_end_parallel_reduce(const uint64_t kID) { if (kID > 0) { - get_global_fence_choice(); // re-read environment variable to get most accurate - // value - if (0 < tool_globFence) { + get_global_fence_choice(); // re-read environment variable to get most + // accurate value + if (0 < tool_globFence) { invoke_ktools_fence(0); } if (tool_verbosity > 0) { From eda9d8342ae34a868adc656ca1c34e1b5bd6e28f Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Mon, 31 Jul 2023 11:10:54 -0700 Subject: [PATCH 18/29] fixing with correct device id --- common/kokkos-sampler/kp_sampler_skip.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index c338ef7dd..2cda0897b 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -47,6 +47,17 @@ void invoke_ktools_fence(uint32_t devID) { "Fence is NULL!\n"); } +uint32_t getDeviceID(uint32_t devid_in) +{ + + int num_device_bits = 7; + int num_instance_bits = 17; + + return (~((uint32_t(-1)) << num_device_bits)) & + (devid_in >> num_instance_bits); + +} + void kokkosp_provide_tool_programming_interface( uint32_t num_funcs, Kokkos_Tools_ToolProgrammingInterface* funcsFromTPI) { if (!num_funcs) { @@ -195,7 +206,7 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, } if (NULL != beginForCallee) { - (*beginForCallee)(name, devID, kID); + (*beginForCallee)(name, getDeviceID(devID), kID); } } } @@ -238,7 +249,7 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, (unsigned long long)(*kID)); } if (NULL != beginScanCallee) { - (*beginScanCallee)(name, devID, kID); + (*beginScanCallee)(name, getDeviceID(devID), kID); } } } @@ -283,7 +294,7 @@ void 
kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, (unsigned long long)(*kID)); } if (NULL != beginReduceCallee) { - (*beginReduceCallee)(name, devID, kID); + (*beginReduceCallee)(name, getDeviceID(devID), kID); } } } From 5873f0aae31c9ec69c4116bc1823f9a902342e7c Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Mon, 31 Jul 2023 11:16:38 -0700 Subject: [PATCH 19/29] clang format for fix with device ID --- common/kokkos-sampler/kp_sampler_skip.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 2cda0897b..06a188fdf 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -47,15 +47,12 @@ void invoke_ktools_fence(uint32_t devID) { "Fence is NULL!\n"); } -uint32_t getDeviceID(uint32_t devid_in) -{ - - int num_device_bits = 7; - int num_instance_bits = 17; - - return (~((uint32_t(-1)) << num_device_bits)) & - (devid_in >> num_instance_bits); +uint32_t getDeviceID(uint32_t devid_in) { + int num_device_bits = 7; + int num_instance_bits = 17; + return (~((uint32_t(-1)) << num_device_bits)) & + (devid_in >> num_instance_bits); } void kokkosp_provide_tool_programming_interface( From 874ad93d58f7daa0c72fdc45c61c8ca8e14c240d Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Mon, 31 Jul 2023 11:58:06 -0700 Subject: [PATCH 20/29] adding deep copy and fence callback in sampler --- common/kokkos-sampler/kp_sampler_skip.cpp | 100 +++++++++++++++++++++- 1 file changed, 98 insertions(+), 2 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 06a188fdf..a9789b12d 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -23,10 +23,13 @@ static finalizeFunction finalizeProfileLibrary = NULL; static 
beginFunction beginForCallee = NULL; static beginFunction beginScanCallee = NULL; static beginFunction beginReduceCallee = NULL; +static beginFunction beginFenceCallee = NULL; +static beginFunction beginDeepCopyCallee = NULL; static endFunction endForCallee = NULL; static endFunction endScanCallee = NULL; static endFunction endReduceCallee = NULL; - +static endFunction endFenceCallee = NULL; +static endFunction endDeepCopyCallee = NULL; void get_global_fence_choice() { // re-read environment variable to get most accurate value const char* tool_globFence_str = getenv("KOKKOS_TOOLS_GLOBALFENCES"); @@ -134,13 +137,20 @@ void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, (beginFunction)dlsym(childLibrary, "kokkosp_begin_parallel_scan"); beginReduceCallee = (beginFunction)dlsym(childLibrary, "kokkosp_begin_parallel_reduce"); - + beginFenceCallee = + (beginFunction)dlsym(childLibrary, "kokkosp_begin_fence"); + beginDeepCopyCallee = + (beginFunction)dlsym(childLibrary, "kokkosp_begin_deep_copy"); endScanCallee = (endFunction)dlsym(childLibrary, "kokkosp_end_parallel_scan"); endForCallee = (endFunction)dlsym(childLibrary, "kokkosp_end_parallel_for"); endReduceCallee = (endFunction)dlsym(childLibrary, "kokkosp_end_parallel_reduce"); + endFenceCallee = (endFunction)dlsym(childLibrary, "kokkosp_end_fence"); + endDeepCopyCallee = + (endFunction)dlsym(childLibrary, "kokkosp_end_deep_copy"); + initProfileLibrary = (initFunction)dlsym(childLibrary, "kokkosp_init_library"); finalizeProfileLibrary = @@ -313,6 +323,92 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { } // end kID sample } +void kokkosp_begin_fence(const char* name, const uint32_t devID, + uint64_t* kID) { + *kID = 0; + static uint64_t invocationNum; + ++invocationNum; + if ((invocationNum % kernelSampleSkip) == 0) { + get_global_fence_choice(); // re-read environment variable to get most + // accurate value + if (0 < tool_globFence) { + invoke_ktools_fence( + 0); // invoke tool-induced 
fence from device 0 for now + } + *kID = + 1; // set kernel ID to 1 so that it is matched with the end_parallel_* + if (tool_verbosity > 0) { + printf("KokkosP: sample %llu calling child-begin function...\n", + (unsigned long long)(*kID)); + } + + if (NULL != beginFenceCallee) { + (*beginFenceCallee)(name, getDeviceID(devID), kID); + } + } +} + +void kokkosp_end_fence(const uint64_t kID) { + if (kID > 0) { + get_global_fence_choice(); // re-read environment variable to get most + // accurate value + if (0 < tool_globFence) { + invoke_ktools_fence( + 0); // invoke tool-induced fence from device 0 for now + } + if (tool_verbosity > 0) { + printf("KokkosP: sample %llu calling child-end function...\n", + (unsigned long long)(kID)); + } + if (NULL != endFenceCallee) { + (*endFenceCallee)(kID); + } + } +} + +void kokkosp_begin_deep_copy(const char* name, const uint32_t devID, + uint64_t* kID) { + *kID = 0; + static uint64_t invocationNum; + ++invocationNum; + if ((invocationNum % kernelSampleSkip) == 0) { + get_global_fence_choice(); // re-read environment variable to get most + // accurate value + if (0 < tool_globFence) { + invoke_ktools_fence( + 0); // invoke tool-induced fence from device 0 for now + } + *kID = + 1; // set kernel ID to 1 so that it is matched with the end_parallel_* + if (tool_verbosity > 0) { + printf("KokkosP: sample %llu calling child-begin function...\n", + (unsigned long long)(*kID)); + } + + if (NULL != beginDeepCopyCallee) { + (*beginDeepCopyCallee)(name, getDeviceID(devID), kID); + } + } +} + +void kokkosp_end_deep_copy(const uint64_t kID) { + if (kID > 0) { + get_global_fence_choice(); // re-read environment variable to get most + // accurate value + if (0 < tool_globFence) { + invoke_ktools_fence( + 0); // invoke tool-induced fence from device 0 for now + } + if (tool_verbosity > 0) { + printf("KokkosP: sample %llu calling child-end function...\n", + (unsigned long long)(kID)); + } + if (NULL != endDeepCopyCallee) { + 
(*endDeepCopyCallee)(kID); + } + } +} + } // namespace Sampler } // end namespace KokkosTools From a76e1ac2da3e5a8bfcf0ee18c9ffc3c2dd02fc6f Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Thu, 3 Aug 2023 10:20:35 -0700 Subject: [PATCH 21/29] Update kp_sampler_skip.cpp --- common/kokkos-sampler/kp_sampler_skip.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index a9789b12d..0f343e44e 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -256,7 +256,7 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, (unsigned long long)(*kID)); } if (NULL != beginScanCallee) { - (*beginScanCallee)(name, getDeviceID(devID), kID); + (*beginScanCallee)(name,devID, kID); } } } @@ -301,7 +301,7 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, (unsigned long long)(*kID)); } if (NULL != beginReduceCallee) { - (*beginReduceCallee)(name, getDeviceID(devID), kID); + (*beginReduceCallee)(name, devID, kID); } } } @@ -343,7 +343,7 @@ void kokkosp_begin_fence(const char* name, const uint32_t devID, } if (NULL != beginFenceCallee) { - (*beginFenceCallee)(name, getDeviceID(devID), kID); + (*beginFenceCallee)(name, devID, kID); } } } From fd01d940a713cde920fdf4f0427179a6f911fc43 Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Thu, 3 Aug 2023 11:24:43 -0700 Subject: [PATCH 22/29] add atomic for uniqID More with hashmap and nestedID coming --- common/kokkos-sampler/kp_sampler_skip.cpp | 49 ++++++++++++++--------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 0f343e44e..20b629913 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -5,10 
+5,10 @@ #include #include "../../profiling/all/kp_core.hpp" #include "kp_config.hpp" - +#include namespace KokkosTools { namespace Sampler { -static uint64_t uniqID = 0; +static atomic uniqID = 0; static uint64_t kernelSampleSkip = 101; static int tool_verbosity = 0; static int tool_globFence = 0; @@ -167,12 +167,20 @@ void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, (beginScanCallee == NULL) ? "no" : "yes"); printf("KokkosP: begin-parallel-reduce: %s\n", (beginReduceCallee == NULL) ? "no" : "yes"); + printf("KokkosP: begin-deep-copy: %s\n", + (beginDeepCopyCallee == NULL) ? "no" : "yes"); + printf("KokkosP: begin-fence: %s\n", + (beginFenceCallee == NULL) ? "no" : "yes"); printf("KokkosP: end-parallel-for: %s\n", (endForCallee == NULL) ? "no" : "yes"); printf("KokkosP: end-parallel-scan: %s\n", (endScanCallee == NULL) ? "no" : "yes"); printf("KokkosP: end-parallel-reduce: %s\n", (endReduceCallee == NULL) ? "no" : "yes"); + printf("KokkosP: end-deep-copy: %s\n", + (endDeepCopyCallee == NULL) ? "no" : "yes"); + printf("KokkosP: end-fence: %s\n", + (endFenceCallee == NULL) ? 
"no" : "yes"); } } } @@ -195,7 +203,7 @@ void kokkosp_finalize_library() { void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, uint64_t* kID) { - *kID = 0; + *kID = uniqID++; static uint64_t invocationNum; ++invocationNum; if ((invocationNum % kernelSampleSkip) == 0) { @@ -213,13 +221,13 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, } if (NULL != beginForCallee) { - (*beginForCallee)(name, getDeviceID(devID), kID); + (*beginForCallee)(name, devID, kID); } } } void kokkosp_end_parallel_for(const uint64_t kID) { - if (kID > 0) { + if (kID > 0) { // check whether the corresponding begin parallel for was called // TODO: fix to hashmap get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { @@ -238,7 +246,7 @@ void kokkosp_end_parallel_for(const uint64_t kID) { void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, uint64_t* kID) { - *kID = 0; // set memory location value of kID to 0. 
+ *kID = uniqID++; // set memory location value of kID to uniqID static uint64_t invocationNum; ++invocationNum; if ((invocationNum % kernelSampleSkip) == 0) { @@ -256,13 +264,13 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, (unsigned long long)(*kID)); } if (NULL != beginScanCallee) { - (*beginScanCallee)(name,devID, kID); + (*beginScanCallee)(name, devID, kID); } } } void kokkosp_end_parallel_scan(const uint64_t kID) { - if (kID > 0) { + if (kID > 0) { // check that we match the begin scan kernel call get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { @@ -283,7 +291,7 @@ void kokkosp_end_parallel_scan(const uint64_t kID) { void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, uint64_t* kID) { - *kID = 0; + *kID = uniqID++; static uint64_t invocationNum; ++invocationNum; if ((invocationNum % kernelSampleSkip) == 0) { @@ -295,7 +303,7 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, // only invokes a global (device 0 invoked) fence. invoke_ktools_fence(0); } - *kID = 1; // set kernel ID to 1 so that it is matched with the end. +// *kID = 1; // set kernel ID to 1 so that it is matched with the end. 
if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); @@ -325,7 +333,7 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { void kokkosp_begin_fence(const char* name, const uint32_t devID, uint64_t* kID) { - *kID = 0; + *kID = uniqID++; static uint64_t invocationNum; ++invocationNum; if ((invocationNum % kernelSampleSkip) == 0) { @@ -335,8 +343,6 @@ void kokkosp_begin_fence(const char* name, const uint32_t devID, invoke_ktools_fence( 0); // invoke tool-induced fence from device 0 for now } - *kID = - 1; // set kernel ID to 1 so that it is matched with the end_parallel_* if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); @@ -368,31 +374,29 @@ void kokkosp_end_fence(const uint64_t kID) { void kokkosp_begin_deep_copy(const char* name, const uint32_t devID, uint64_t* kID) { - *kID = 0; + *kID = uniqID++; static uint64_t invocationNum; ++invocationNum; - if ((invocationNum % kernelSampleSkip) == 0) { + if ((invocationNum % kernelSampleSkip) == 0) { get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { invoke_ktools_fence( 0); // invoke tool-induced fence from device 0 for now - } - *kID = - 1; // set kernel ID to 1 so that it is matched with the end_parallel_* + } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); } if (NULL != beginDeepCopyCallee) { - (*beginDeepCopyCallee)(name, getDeviceID(devID), kID); + (*beginDeepCopyCallee)(name, devID, kID); } } } void kokkosp_end_deep_copy(const uint64_t kID) { - if (kID > 0) { + if (kID > 0) { // check that we match the begin deep copy kernel call get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { @@ -427,5 +431,10 @@ EXPOSE_BEGIN_PARALLEL_SCAN(impl::kokkosp_begin_parallel_scan) 
EXPOSE_END_PARALLEL_SCAN(impl::kokkosp_end_parallel_scan) EXPOSE_BEGIN_PARALLEL_REDUCE(impl::kokkosp_begin_parallel_reduce) EXPOSE_END_PARALLEL_REDUCE(impl::kokkosp_end_parallel_reduce) +EXPOSE_BEGIN_DEEP_COPY(impl::kokkosp_begin_deep_copy) +EXPOSE_END_DEEP_COPY(impl::kokkosp_end_deep_copy) +EXPOSE_BEGIN_FENCE(impl::kokkosp_begin_fence) +EXPOSE_END_FENCE(impl::kokkosp_end_fence) + } // end extern "C" From 57f9a1fb6144e4dafb32e0158b93c60c7f841209 Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Mon, 7 Aug 2023 08:07:20 -0700 Subject: [PATCH 23/29] change atomic to std::atomic --- common/kokkos-sampler/kp_sampler_skip.cpp | 30 ++++++++++------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 20b629913..a76139e9b 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -8,7 +8,7 @@ #include namespace KokkosTools { namespace Sampler { -static atomic uniqID = 0; +static std:atomic uniqID = 0; static uint64_t kernelSampleSkip = 101; static int tool_verbosity = 0; static int tool_globFence = 0; @@ -210,29 +210,27 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { - invoke_ktools_fence( - 0); // invoke tool-induced fence from device 0 for now + invoke_ktools_fence(0); // invoke tool-induced fence from device number 0 // TODO: use getDeviceID } - *kID = - 1; // set kernel ID to 1 so that it is matched with the end_parallel_* if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); } if (NULL != beginForCallee) { - (*beginForCallee)(name, devID, kID); + uint64_t* nestedkID = 0; + (*beginForCallee)(name, devID, kID); // TODO: replace kID with nestedkID + // map.insert(kID, 
nestedkID); } } } void kokkosp_end_parallel_for(const uint64_t kID) { - if (kID > 0) { // check whether the corresponding begin parallel for was called // TODO: fix to hashmap + if (kID == uniqID) { // check whether the corresponding begin parallel for was called // TODO: fix to hashmap get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { - invoke_ktools_fence( - 0); // invoke tool-induced fence from device 0 for now + invoke_ktools_fence(0); // invoke tool-induced fence from device number } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", @@ -240,8 +238,9 @@ void kokkosp_end_parallel_for(const uint64_t kID) { } if (NULL != endForCallee) { (*endForCallee)(kID); + // (*endForCallee)(find(kID)); } - } + } // end uniqID sample match conditional } void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, @@ -255,10 +254,9 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, if (0 < tool_globFence) { // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function - // only invokes a global (device 0 invoked) fence + // only invokes a fence on the device 0 invoke_ktools_fence(0); } - *kID = 1; // set kernel ID to 1 so that it is matched with the end. 
if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); @@ -270,7 +268,7 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, } void kokkosp_end_parallel_scan(const uint64_t kID) { - if (kID > 0) { // check that we match the begin scan kernel call + if (kID == uniqID) { // check that we match the begin scan kernel call get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { @@ -303,7 +301,6 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, // only invokes a global (device 0 invoked) fence. invoke_ktools_fence(0); } -// *kID = 1; // set kernel ID to 1 so that it is matched with the end. if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); @@ -355,7 +352,7 @@ void kokkosp_begin_fence(const char* name, const uint32_t devID, } void kokkosp_end_fence(const uint64_t kID) { - if (kID > 0) { + if (kID == uniqID) { get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { @@ -396,7 +393,7 @@ void kokkosp_begin_deep_copy(const char* name, const uint32_t devID, } void kokkosp_end_deep_copy(const uint64_t kID) { - if (kID > 0) { // check that we match the begin deep copy kernel call + if (kID == uniqID) { // check that we match the begin deep copy kernel call get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { @@ -436,5 +433,4 @@ EXPOSE_END_DEEP_COPY(impl::kokkosp_end_deep_copy) EXPOSE_BEGIN_FENCE(impl::kokkosp_begin_fence) EXPOSE_END_FENCE(impl::kokkosp_end_fence) - } // end extern "C" From fa43e59213e2ed914fb5afcbb61dd482f6f66f55 Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Mon, 7 Aug 2023 14:33:01 -0700 Subject: [PATCH 24/29] committing fixed file for kp sampler, making 
thread safe and keeping track of nested (child) kernel ID on sample. --- common/kokkos-sampler/kp_sampler_skip.cpp | 395 ++++++++++++++-------- 1 file changed, 248 insertions(+), 147 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index a76139e9b..4aa470840 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -6,9 +6,19 @@ #include "../../profiling/all/kp_core.hpp" #include "kp_config.hpp" #include +#include +#include + +using namespace std; + namespace KokkosTools { namespace Sampler { -static std:atomic uniqID = 0; +static atomic uniqID = 0; +static mutex sampler_mtx; +// static tuple tupleofkID; +// static pair nestedkIDdevID; +static unordered_map> infokIDSample; +// static unordered_map nestedkIDofkID; static uint64_t kernelSampleSkip = 101; static int tool_verbosity = 0; static int tool_globFence = 0; @@ -23,13 +33,9 @@ static finalizeFunction finalizeProfileLibrary = NULL; static beginFunction beginForCallee = NULL; static beginFunction beginScanCallee = NULL; static beginFunction beginReduceCallee = NULL; -static beginFunction beginFenceCallee = NULL; -static beginFunction beginDeepCopyCallee = NULL; static endFunction endForCallee = NULL; static endFunction endScanCallee = NULL; static endFunction endReduceCallee = NULL; -static endFunction endFenceCallee = NULL; -static endFunction endDeepCopyCallee = NULL; void get_global_fence_choice() { // re-read environment variable to get most accurate value const char* tool_globFence_str = getenv("KOKKOS_TOOLS_GLOBALFENCES"); @@ -137,20 +143,12 @@ void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, (beginFunction)dlsym(childLibrary, "kokkosp_begin_parallel_scan"); beginReduceCallee = (beginFunction)dlsym(childLibrary, "kokkosp_begin_parallel_reduce"); - beginFenceCallee = - (beginFunction)dlsym(childLibrary, "kokkosp_begin_fence"); - beginDeepCopyCallee = - 
(beginFunction)dlsym(childLibrary, "kokkosp_begin_deep_copy"); - endScanCallee = - (endFunction)dlsym(childLibrary, "kokkosp_end_parallel_scan"); endForCallee = (endFunction)dlsym(childLibrary, "kokkosp_end_parallel_for"); + endScanCallee = + (endFunction)dlsym(childLibrary, "kokkosp_end_parallel_scan"); endReduceCallee = (endFunction)dlsym(childLibrary, "kokkosp_end_parallel_reduce"); - endFenceCallee = (endFunction)dlsym(childLibrary, "kokkosp_end_fence"); - endDeepCopyCallee = - (endFunction)dlsym(childLibrary, "kokkosp_end_deep_copy"); - initProfileLibrary = (initFunction)dlsym(childLibrary, "kokkosp_init_library"); finalizeProfileLibrary = @@ -167,20 +165,12 @@ void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, (beginScanCallee == NULL) ? "no" : "yes"); printf("KokkosP: begin-parallel-reduce: %s\n", (beginReduceCallee == NULL) ? "no" : "yes"); - printf("KokkosP: begin-deep-copy: %s\n", - (beginDeepCopyCallee == NULL) ? "no" : "yes"); - printf("KokkosP: begin-fence: %s\n", - (beginFenceCallee == NULL) ? "no" : "yes"); printf("KokkosP: end-parallel-for: %s\n", (endForCallee == NULL) ? "no" : "yes"); printf("KokkosP: end-parallel-scan: %s\n", (endScanCallee == NULL) ? "no" : "yes"); - printf("KokkosP: end-parallel-reduce: %s\n", + printf("KokkosP: end-parallel-reduce: %s\n", (endReduceCallee == NULL) ? "no" : "yes"); - printf("KokkosP: end-deep-copy: %s\n", - (endDeepCopyCallee == NULL) ? "no" : "yes"); - printf("KokkosP: end-fence: %s\n", - (endFenceCallee == NULL) ? 
"no" : "yes"); } } } @@ -207,208 +197,323 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, static uint64_t invocationNum; ++invocationNum; if ((invocationNum % kernelSampleSkip) == 0) { + // accurate value + pair infoOfSample = make_tuple(0, 0); + // infoOfSample = new pair(0, 0); + uint32_t devNum = getDeviceID(devID); get_global_fence_choice(); // re-read environment variable to get most - // accurate value + // accurate if (0 < tool_globFence) { - invoke_ktools_fence(0); // invoke tool-induced fence from device number 0 // TODO: use getDeviceID + if (0 <= devNum) { + invoke_ktools_fence( + devNum); // invoke tool-induced fence from device number + } else { // device number is negative + if (tool_verbosity > 0) + printf( + "KokkosP: warning: device number obtained (%lu) from " + "sampler is negative. Tool-induced " + " fence called with argument 0. \n", + (unsigned long)devNum); + invoke_ktools_fence(0); + } } + infoOfSample.second = devID; + // devNumofkID.insert(devNum); + if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); } - if (NULL != beginForCallee) { uint64_t* nestedkID = 0; - (*beginForCallee)(name, devID, kID); // TODO: replace kID with nestedkID - // map.insert(kID, nestedkID); + (*beginForCallee)(name, devID, nestedkID); // replace kID with nestedkID + sampler_mtx.lock(); + infoOfSample.first = *nestedkID; + infokIDSample.insert({*kID, infoOfSample}); + sampler_mtx.unlock(); + } else { // no child to call + if (tool_verbosity > 1) { + printf( + "KokkosP: warning: sampler's begin_parallel_for callback has no " + "child" + " function to call.\n"); + } } - } + } // end sample gathering insert for parallel for } void kokkosp_end_parallel_for(const uint64_t kID) { - if (kID == uniqID) { // check whether the corresponding begin parallel for was called // TODO: fix to hashmap + pair infoOfMatchedSample; + sampler_mtx.lock(); + if (!(infokIDSample.find(kID) == + 
infokIDSample.end())) { // check that we match the begin parallel for + // kernel call + infoOfMatchedSample = infokIDSample.at(kID); + uint32_t devNum; + uint32_t devID; + devID = infoOfMatchedSample.second; + devNum = getDeviceID(devID); get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { - invoke_ktools_fence(0); // invoke tool-induced fence from device number + if (0 <= devNum) { + invoke_ktools_fence( + devNum); // invoke tool-induced fence from device number + } else { // device number is negative + if (tool_verbosity > 0) { + printf( + "KokkosP: Warning: device number of sample's kernel ID is %lu " + "and is corrupted! Invoking global fence. \n", + (unsigned long)devNum); + } + invoke_ktools_fence(0); + } } if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", (unsigned long long)(kID)); } + if (NULL != endForCallee) { - (*endForCallee)(kID); - // (*endForCallee)(find(kID)); + // (*endForCallee)(kID); + uint64_t nestedkID = infoOfMatchedSample.first; + if (0 > nestedkID) { + printf( + "KokkosP: Warning: not calling endForCallee. 
sampler's child's kID " + "(nested " + " kID) is %llu - less than 0 - and hence is corrupted!\n", + (unsigned long long)nestedkID); + } else { + (*endForCallee)(nestedkID); + } + infokIDSample.erase(kID); + } else { + if (tool_verbosity > 1) + printf("KokkosP: Warning: sampler's endForCallee not found.\n"); } - } // end uniqID sample match conditional + } // end sampler gather for end_parallel_for + sampler_mtx.unlock(); } void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, uint64_t* kID) { - *kID = uniqID++; // set memory location value of kID to uniqID + *kID = uniqID++; // set memory location value of kID to uniqID static uint64_t invocationNum; ++invocationNum; + uint32_t devNum; if ((invocationNum % kernelSampleSkip) == 0) { + pair infoOfSample = make_tuple(0, 0); + devNum = getDeviceID(devID); get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function - // only invokes a fence on the device 0 - invoke_ktools_fence(0); + // only invokes a fence on the device of the devID passed + if (0 <= devNum) { + invoke_ktools_fence(devNum); + if (tool_verbosity > 1) { + printf( + "KokkosP: sampler begin_parallel_scan callback" + " invoked fence on device number %lu\n", + (unsigned long)devNum); + } + } else { // device obtained was negative + if (tool_verbosity > 0) + printf( + "KokkosP: warning: device number obtained was negative, " + "specifically %lu." 
+ " Calling global tool induced fence.\n", + (unsigned long)devNum); + invoke_ktools_fence(0); + } } + infoOfSample.second = devID; if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); } if (NULL != beginScanCallee) { - (*beginScanCallee)(name, devID, kID); + uint64_t* nestedkID = 0; + (*beginScanCallee)(name, devID, nestedkID); + sampler_mtx.lock(); + infoOfSample.first = *nestedkID; + infokIDSample.insert({*kID, infoOfSample}); + sampler_mtx.unlock(); + } else { + if (tool_verbosity > 1) + printf( + "KokkosP: Warning: sampler has no beginScanCallee function " + "available to call.\n"); } - } -} + } // end sample gather scan +} // end begin scan callback void kokkosp_end_parallel_scan(const uint64_t kID) { - if (kID == uniqID) { // check that we match the begin scan kernel call + pair infoOfMatchedSample; + sampler_mtx.lock(); + if ((infokIDSample.find(kID) == + infokIDSample + .end())) { // check that we match the begin scan kernel call + infoOfMatchedSample = infokIDSample.at(kID); + uint32_t devNum; + // devNum = devNumofkID.find(kID); + uint32_t devID = infoOfMatchedSample.second; + devNum = getDeviceID(devID); get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { - // using tool-induced fence from Kokkos_profiling rather than - // Kokkos_C_Profiling_interface. Note that this function - // only invokes a global (device 0 invoked) fence. - invoke_ktools_fence(0); - } + if (devNum < 0) { + printf( + "KokkosP: Warning: global tool-induced fence for end_parallel_scan " + "invoked." + " Retrieved device ID of sampler" + " is %lu and hence is corrupted!\n", + (unsigned long)devNum); + // using tool-induced fence from Kokkos_profiling rather than + // Kokkos_C_Profiling_interface. Note that this function + // only invokes a fence on the device fenced on the begin parallel scan. 
+ invoke_ktools_fence(0); + } else { // device is non-negative + invoke_ktools_fence(devNum); + if (tool_verbosity > 1) { + printf( + "KokkosP: sampler end_parallel_scan callback" + " invoked tool-induced fence on device %lu\n", + (unsigned long)devNum); + } + } + + } // end invoke fence conditional if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", (unsigned long long)(kID)); } if (NULL != endScanCallee) { - (*endScanCallee)(kID); + uint64_t nestedkID = infoOfMatchedSample.first; + if (nestedkID < 0) { + printf( + "KokkosP: Warning: sampler's child's kID (nested kID) is %llu - " + "less than 0 - and hence is corrupted!\n", + (unsigned long long)nestedkID); + } else + (*endScanCallee)(nestedkID); + infokIDSample.erase(kID); + } else { + if (tool_verbosity > 1) + printf( + "KokkosP: warning: sampler's end parallel scan has no " + "endScanCallee to call\n"); } } // end kID sample -} + sampler_mtx.unlock(); +} // end end_parallel_scan callback void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, uint64_t* kID) { *kID = uniqID++; static uint64_t invocationNum; ++invocationNum; + pair infoOfSample = make_tuple(0, 0); + uint32_t devNum; if ((invocationNum % kernelSampleSkip) == 0) { + devNum = getDeviceID(devID); get_global_fence_choice(); // re-read environment variable to get most // accurate value - if (0 < tool_globFence) { - // using tool-induced fence from Kokkos_profiling rather than - // Kokkos_C_Profiling_interface. Note that this function - // only invokes a global (device 0 invoked) fence. + if (0 < devNum) { + if (0 < tool_globFence) { + // using tool-induced fence from Kokkos_profiling rather than + // Kokkos_C_Profiling_interface. 
Note that this function + // only invokes a fence on devNum of devID specified + invoke_ktools_fence(devNum); + } + } else { // device number is negative + if (tool_verbosity > 0) { + printf( + "KokkosP: warning: sampler begin_parallel_reduce obtained negative " + "dev number, i.e., %d \n", + devNum); + } invoke_ktools_fence(0); } + + infoOfSample.second = devID; + if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); } if (NULL != beginReduceCallee) { - (*beginReduceCallee)(name, devID, kID); + uint64_t* nestedkID = 0; + (*beginReduceCallee)(name, devID, nestedkID); + infoOfSample.first = *nestedkID; + sampler_mtx.lock(); + infokIDSample.insert({*kID, infoOfSample}); + sampler_mtx.unlock(); + } else { + if (tool_verbosity > 1) { + printf( + "KokkosP: Note: begin_parallel_reduce sampler " + " callback has no child-begin function to call.\n"); + } } - } -} + } // end sample gather +} // end begin_parallel_reduce callback void kokkosp_end_parallel_reduce(const uint64_t kID) { - if (kID > 0) { + pair infoOfMatchedSample = make_tuple(0, 0); + uint32_t devNum; + uint32_t devID; + sampler_mtx.lock(); + if (!(infokIDSample.find(kID) == infokIDSample.end())) { + infoOfMatchedSample = infokIDSample.at(kID); + devID = infoOfMatchedSample.second; + devNum = getDeviceID(devID); get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { - invoke_ktools_fence(0); - } + if (0 < devNum) { + invoke_ktools_fence(devNum); + if (tool_verbosity > 1) { + printf( + "KokkosP: sampler's end parallel reduce obtained devNum %lu \n", + (unsigned long)devNum); + } + } else { // device number is negative + if (tool_verbosity > 0) { + printf( + "KokkosP: warning: calling tool induced fence on all devices " + "from sampler end_parallel_reduce. 
" + "obtained negative device number, i.e., %lu \n", + (unsigned long)devNum); + } + invoke_ktools_fence(0); + } + } // end tool invoked fence if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", (unsigned long long)(kID)); } if (NULL != endReduceCallee) { - (*endReduceCallee)(kID); + uint64_t nestedkID = infoOfMatchedSample.second; + if (0 > nestedkID) { + printf( + "KokkosP: sampler's child's kID (nested kID) is %llu - less than 0 " + "- and hence is corrupted!\n", + (unsigned long long)nestedkID); + } + (*endReduceCallee)(nestedkID); + infokIDSample.erase(kID); + } else { + if (tool_verbosity > 1) { + printf( + "KokkosP: Note: end_parallel_reduce sampler " + "callback has no child-end function to call.\n"); + } } } // end kID sample -} - -void kokkosp_begin_fence(const char* name, const uint32_t devID, - uint64_t* kID) { - *kID = uniqID++; - static uint64_t invocationNum; - ++invocationNum; - if ((invocationNum % kernelSampleSkip) == 0) { - get_global_fence_choice(); // re-read environment variable to get most - // accurate value - if (0 < tool_globFence) { - invoke_ktools_fence( - 0); // invoke tool-induced fence from device 0 for now - } - if (tool_verbosity > 0) { - printf("KokkosP: sample %llu calling child-begin function...\n", - (unsigned long long)(*kID)); - } - - if (NULL != beginFenceCallee) { - (*beginFenceCallee)(name, devID, kID); - } - } -} - -void kokkosp_end_fence(const uint64_t kID) { - if (kID == uniqID) { - get_global_fence_choice(); // re-read environment variable to get most - // accurate value - if (0 < tool_globFence) { - invoke_ktools_fence( - 0); // invoke tool-induced fence from device 0 for now - } - if (tool_verbosity > 0) { - printf("KokkosP: sample %llu calling child-end function...\n", - (unsigned long long)(kID)); - } - if (NULL != endFenceCallee) { - (*endFenceCallee)(kID); - } - } -} - -void kokkosp_begin_deep_copy(const char* name, const uint32_t devID, - uint64_t* kID) { - *kID = uniqID++; - 
static uint64_t invocationNum; - ++invocationNum; - if ((invocationNum % kernelSampleSkip) == 0) { - get_global_fence_choice(); // re-read environment variable to get most - // accurate value - if (0 < tool_globFence) { - invoke_ktools_fence( - 0); // invoke tool-induced fence from device 0 for now - } - if (tool_verbosity > 0) { - printf("KokkosP: sample %llu calling child-begin function...\n", - (unsigned long long)(*kID)); - } - - if (NULL != beginDeepCopyCallee) { - (*beginDeepCopyCallee)(name, devID, kID); - } - } -} - -void kokkosp_end_deep_copy(const uint64_t kID) { - if (kID == uniqID) { // check that we match the begin deep copy kernel call - get_global_fence_choice(); // re-read environment variable to get most - // accurate value - if (0 < tool_globFence) { - invoke_ktools_fence( - 0); // invoke tool-induced fence from device 0 for now - } - if (tool_verbosity > 0) { - printf("KokkosP: sample %llu calling child-end function...\n", - (unsigned long long)(kID)); - } - if (NULL != endDeepCopyCallee) { - (*endDeepCopyCallee)(kID); - } - } -} + sampler_mtx.unlock(); +} // end end_parallel_reduce sampler callback } // namespace Sampler } // end namespace KokkosTools @@ -428,9 +533,5 @@ EXPOSE_BEGIN_PARALLEL_SCAN(impl::kokkosp_begin_parallel_scan) EXPOSE_END_PARALLEL_SCAN(impl::kokkosp_end_parallel_scan) EXPOSE_BEGIN_PARALLEL_REDUCE(impl::kokkosp_begin_parallel_reduce) EXPOSE_END_PARALLEL_REDUCE(impl::kokkosp_end_parallel_reduce) -EXPOSE_BEGIN_DEEP_COPY(impl::kokkosp_begin_deep_copy) -EXPOSE_END_DEEP_COPY(impl::kokkosp_end_deep_copy) -EXPOSE_BEGIN_FENCE(impl::kokkosp_begin_fence) -EXPOSE_END_FENCE(impl::kokkosp_end_fence) } // end extern "C" From ffc4fd8338dcafb4cbb723da7efe3b81d80b54b6 Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Mon, 7 Aug 2023 14:50:07 -0700 Subject: [PATCH 25/29] taking out tuple initialization of pair to get rid of CI warning as error --- common/kokkos-sampler/kp_sampler_skip.cpp | 17 
+++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 4aa470840..a30f72073 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -15,10 +15,7 @@ namespace KokkosTools { namespace Sampler { static atomic uniqID = 0; static mutex sampler_mtx; -// static tuple tupleofkID; -// static pair nestedkIDdevID; static unordered_map> infokIDSample; -// static unordered_map nestedkIDofkID; static uint64_t kernelSampleSkip = 101; static int tool_verbosity = 0; static int tool_globFence = 0; @@ -197,9 +194,7 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, static uint64_t invocationNum; ++invocationNum; if ((invocationNum % kernelSampleSkip) == 0) { - // accurate value - pair infoOfSample = make_tuple(0, 0); - // infoOfSample = new pair(0, 0); + pair infoOfSample; uint32_t devNum = getDeviceID(devID); get_global_fence_choice(); // re-read environment variable to get most // accurate @@ -218,8 +213,6 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, } } infoOfSample.second = devID; - // devNumofkID.insert(devNum); - if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); @@ -302,8 +295,8 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, ++invocationNum; uint32_t devNum; if ((invocationNum % kernelSampleSkip) == 0) { - pair infoOfSample = make_tuple(0, 0); - devNum = getDeviceID(devID); + pair infoOfSample; + devNum = getDeviceID(devID); get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { @@ -414,7 +407,7 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, *kID = uniqID++; static uint64_t invocationNum; ++invocationNum; - pair infoOfSample = make_tuple(0, 0); + pair infoOfSample; uint32_t 
devNum; if ((invocationNum % kernelSampleSkip) == 0) { devNum = getDeviceID(devID); @@ -461,7 +454,7 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, } // end begin_parallel_reduce callback void kokkosp_end_parallel_reduce(const uint64_t kID) { - pair infoOfMatchedSample = make_tuple(0, 0); + pair infoOfMatchedSample; uint32_t devNum; uint32_t devID; sampler_mtx.lock(); From 70cdf6a04040de054488c179a508b9fdaa628f3c Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Mon, 7 Aug 2023 15:03:06 -0700 Subject: [PATCH 26/29] making valid device number --- common/kokkos-sampler/kp_sampler_skip.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index a30f72073..9d0dd3e33 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -61,6 +61,7 @@ uint32_t getDeviceID(uint32_t devid_in) { (devid_in >> num_instance_bits); } +bool isValidDevNum(uint32_t devNum) { return true; } void kokkosp_provide_tool_programming_interface( uint32_t num_funcs, Kokkos_Tools_ToolProgrammingInterface* funcsFromTPI) { if (!num_funcs) { @@ -199,7 +200,9 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, get_global_fence_choice(); // re-read environment variable to get most // accurate if (0 < tool_globFence) { - if (0 <= devNum) { + if (isValidDevNum( + devNum)) { // check for valid device Num (setting to 1 for now to + // figure out what a valid number is) invoke_ktools_fence( devNum); // invoke tool-induced fence from device number } else { // device number is negative @@ -249,7 +252,7 @@ void kokkosp_end_parallel_for(const uint64_t kID) { get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { - if (0 <= devNum) { + if (isValidDevNum(devNum)) { invoke_ktools_fence( devNum); // 
invoke tool-induced fence from device number } else { // device number is negative @@ -303,7 +306,7 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function // only invokes a fence on the device of the devID passed - if (0 <= devNum) { + if (isValidDevNum(devNum)) { invoke_ktools_fence(devNum); if (tool_verbosity > 1) { printf( @@ -356,7 +359,7 @@ void kokkosp_end_parallel_scan(const uint64_t kID) { get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { - if (devNum < 0) { + if (!isValidDevNum(devNum)) { printf( "KokkosP: Warning: global tool-induced fence for end_parallel_scan " "invoked." @@ -413,7 +416,7 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, devNum = getDeviceID(devID); get_global_fence_choice(); // re-read environment variable to get most // accurate value - if (0 < devNum) { + if (isValidDevNum(devNum)) { if (0 < tool_globFence) { // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. 
Note that this function @@ -465,7 +468,7 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { - if (0 < devNum) { + if (isValidDevNum(devNum)) { invoke_ktools_fence(devNum); if (tool_verbosity > 1) { printf( From cf343f58ed66c6f83b21df0c6bdf71969ef21aac Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Mon, 7 Aug 2023 15:31:20 -0700 Subject: [PATCH 27/29] removing valid nestedkID and valid devID for CI to pass --- common/kokkos-sampler/kp_sampler_skip.cpp | 161 +++++++--------------- 1 file changed, 53 insertions(+), 108 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 9d0dd3e33..5cb889e69 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -61,6 +61,7 @@ uint32_t getDeviceID(uint32_t devid_in) { (devid_in >> num_instance_bits); } +bool isValidNestkID(uint64_t nestkID) { return true; } bool isValidDevNum(uint32_t devNum) { return true; } void kokkosp_provide_tool_programming_interface( uint32_t num_funcs, Kokkos_Tools_ToolProgrammingInterface* funcsFromTPI) { @@ -200,20 +201,14 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, get_global_fence_choice(); // re-read environment variable to get most // accurate if (0 < tool_globFence) { - if (isValidDevNum( - devNum)) { // check for valid device Num (setting to 1 for now to - // figure out what a valid number is) - invoke_ktools_fence( - devNum); // invoke tool-induced fence from device number - } else { // device number is negative - if (tool_verbosity > 0) - printf( - "KokkosP: warning: device number obtained (%lu) from " - "sampler is negative. Tool-induced " - " fence called with argument 0. 
\n", - (unsigned long)devNum); - invoke_ktools_fence(0); - } + invoke_ktools_fence( + devNum); // invoke tool-induced fence from device number + // device number is negative + if (tool_verbosity > 0) + printf( + "KokkosP:device number obtained (%lu) from " + "sampler. \n", + (unsigned long)devNum); } infoOfSample.second = devID; if (tool_verbosity > 0) { @@ -252,34 +247,30 @@ void kokkosp_end_parallel_for(const uint64_t kID) { get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { - if (isValidDevNum(devNum)) { - invoke_ktools_fence( - devNum); // invoke tool-induced fence from device number - } else { // device number is negative - if (tool_verbosity > 0) { - printf( - "KokkosP: Warning: device number of sample's kernel ID is %lu " - "and is corrupted! Invoking global fence. \n", - (unsigned long)devNum); - } - invoke_ktools_fence(0); + invoke_ktools_fence( + devNum); // invoke tool-induced fence from device number + // make sure device number is not negative + if (tool_verbosity > 0) { + printf( + "KokkosP: device number of sample's kernel ID is %lu " + " \n", + (unsigned long)devNum); } } + if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", (unsigned long long)(kID)); } if (NULL != endForCallee) { - // (*endForCallee)(kID); uint64_t nestedkID = infoOfMatchedSample.first; - if (0 > nestedkID) { + if (tool_verbosity > 0) { printf( - "KokkosP: Warning: not calling endForCallee. sampler's child's kID " + "KokkosP: sampler's child's kID " "(nested " - " kID) is %llu - less than 0 - and hence is corrupted!\n", + " kID) is %llu. \n", (unsigned long long)nestedkID); - } else { (*endForCallee)(nestedkID); } infokIDSample.erase(kID); @@ -306,24 +297,16 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. 
Note that this function // only invokes a fence on the device of the devID passed - if (isValidDevNum(devNum)) { - invoke_ktools_fence(devNum); - if (tool_verbosity > 1) { - printf( - "KokkosP: sampler begin_parallel_scan callback" - " invoked fence on device number %lu\n", - (unsigned long)devNum); - } - } else { // device obtained was negative - if (tool_verbosity > 0) - printf( - "KokkosP: warning: device number obtained was negative, " - "specifically %lu." - " Calling global tool induced fence.\n", - (unsigned long)devNum); - invoke_ktools_fence(0); + // + invoke_ktools_fence(devNum); + if (tool_verbosity > 1) { + printf( + "KokkosP: sampler begin_parallel_scan callback" + " invoked fence on device number %lu\n", + (unsigned long)devNum); } } + infoOfSample.second = devID; if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", @@ -353,33 +336,21 @@ void kokkosp_end_parallel_scan(const uint64_t kID) { .end())) { // check that we match the begin scan kernel call infoOfMatchedSample = infokIDSample.at(kID); uint32_t devNum; - // devNum = devNumofkID.find(kID); uint32_t devID = infoOfMatchedSample.second; devNum = getDeviceID(devID); get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { - if (!isValidDevNum(devNum)) { + // using tool-induced fence from Kokkos_profiling rather than + // Kokkos_C_Profiling_interface. Note that this function + // only invokes a fence on the device fenced on the begin parallel scan. + invoke_ktools_fence(devNum); + if (tool_verbosity > 1) { printf( - "KokkosP: Warning: global tool-induced fence for end_parallel_scan " - "invoked." - " Retrieved device ID of sampler" - " is %lu and hence is corrupted!\n", + "KokkosP: sampler end_parallel_scan callback" + " invoked tool-induced fence on device %lu\n", (unsigned long)devNum); - // using tool-induced fence from Kokkos_profiling rather than - // Kokkos_C_Profiling_interface. 
Note that this function - // only invokes a fence on the device fenced on the begin parallel scan. - invoke_ktools_fence(0); - } else { // device is non-negative - invoke_ktools_fence(devNum); - if (tool_verbosity > 1) { - printf( - "KokkosP: sampler end_parallel_scan callback" - " invoked tool-induced fence on device %lu\n", - (unsigned long)devNum); - } } - } // end invoke fence conditional if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-end function...\n", @@ -387,13 +358,7 @@ void kokkosp_end_parallel_scan(const uint64_t kID) { } if (NULL != endScanCallee) { uint64_t nestedkID = infoOfMatchedSample.first; - if (nestedkID < 0) { - printf( - "KokkosP: Warning: sampler's child's kID (nested kID) is %llu - " - "less than 0 - and hence is corrupted!\n", - (unsigned long long)nestedkID); - } else - (*endScanCallee)(nestedkID); + (*endScanCallee)(nestedkID); infokIDSample.erase(kID); } else { if (tool_verbosity > 1) @@ -416,25 +381,19 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, devNum = getDeviceID(devID); get_global_fence_choice(); // re-read environment variable to get most // accurate value - if (isValidDevNum(devNum)) { - if (0 < tool_globFence) { - // using tool-induced fence from Kokkos_profiling rather than - // Kokkos_C_Profiling_interface. Note that this function - // only invokes a fence on devNum of devID specified - invoke_ktools_fence(devNum); - } - } else { // device number is negative - if (tool_verbosity > 0) { + if (0 < tool_globFence) { + // using tool-induced fence from Kokkos_profiling rather than + // Kokkos_C_Profiling_interface. 
Note that this function + // only invokes a fence on devNum of devID specified + invoke_ktools_fence(devNum); + if (tool_verbosity > 1) { printf( - "KokkosP: warning: sampler begin_parallel_reduce obtained negative " - "dev number, i.e., %d \n", - devNum); + "KokkosP: sampler begin_parallel_reduce obtained " + "dev number, i.e., %lu \n", + (unsigned long)devNum); } - invoke_ktools_fence(0); } - infoOfSample.second = devID; - if (tool_verbosity > 0) { printf("KokkosP: sample %llu calling child-begin function...\n", (unsigned long long)(*kID)); @@ -468,22 +427,10 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { - if (isValidDevNum(devNum)) { - invoke_ktools_fence(devNum); - if (tool_verbosity > 1) { - printf( - "KokkosP: sampler's end parallel reduce obtained devNum %lu \n", - (unsigned long)devNum); - } - } else { // device number is negative - if (tool_verbosity > 0) { - printf( - "KokkosP: warning: calling tool induced fence on all devices " - "from sampler end_parallel_reduce. 
" - "obtained negative device number, i.e., %lu \n", - (unsigned long)devNum); - } - invoke_ktools_fence(0); + invoke_ktools_fence(devNum); + if (tool_verbosity > 1) { + printf("KokkosP: sampler's end parallel reduce obtained devNum %lu \n", + (unsigned long)devNum); } } // end tool invoked fence if (tool_verbosity > 0) { @@ -492,11 +439,9 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { } if (NULL != endReduceCallee) { uint64_t nestedkID = infoOfMatchedSample.second; - if (0 > nestedkID) { - printf( - "KokkosP: sampler's child's kID (nested kID) is %llu - less than 0 " - "- and hence is corrupted!\n", - (unsigned long long)nestedkID); + if (tool_verbosity > 0) { + printf("KokkosP: sampler's endReduceCallee kID (nested kID) is %llu \n", + (unsigned long long)nestedkID); } (*endReduceCallee)(nestedkID); infokIDSample.erase(kID); From 98b4caff3dc6038bc7a12c7dca075ccb374f29ff Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Mon, 7 Aug 2023 15:39:53 -0700 Subject: [PATCH 28/29] removing valid nestkID function to pass CI without the warnings as errors. 
--- common/kokkos-sampler/kp_sampler_skip.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 5cb889e69..98e28b35a 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -61,8 +61,6 @@ uint32_t getDeviceID(uint32_t devid_in) { (devid_in >> num_instance_bits); } -bool isValidNestkID(uint64_t nestkID) { return true; } -bool isValidDevNum(uint32_t devNum) { return true; } void kokkosp_provide_tool_programming_interface( uint32_t num_funcs, Kokkos_Tools_ToolProgrammingInterface* funcsFromTPI) { if (!num_funcs) { @@ -201,9 +199,8 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, get_global_fence_choice(); // re-read environment variable to get most // accurate if (0 < tool_globFence) { - invoke_ktools_fence( - devNum); // invoke tool-induced fence from device number - // device number is negative + invoke_ktools_fence(devNum); // invoke tool-induced fence from device + // number device number is negative if (tool_verbosity > 0) printf( "KokkosP:device number obtained (%lu) from " From 43b983d76f786ed356cb01749211c877be4f08ad Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Thu, 7 Sep 2023 13:51:03 -0700 Subject: [PATCH 29/29] Removing unneeded checks of no kID found in kokkosp_end_* --- common/kokkos-sampler/kp_sampler_skip.cpp | 89 +++++++++++------------ 1 file changed, 42 insertions(+), 47 deletions(-) diff --git a/common/kokkos-sampler/kp_sampler_skip.cpp b/common/kokkos-sampler/kp_sampler_skip.cpp index 98e28b35a..e88441bcd 100644 --- a/common/kokkos-sampler/kp_sampler_skip.cpp +++ b/common/kokkos-sampler/kp_sampler_skip.cpp @@ -9,8 +9,6 @@ #include #include -using namespace std; - namespace KokkosTools { namespace Sampler { static atomic uniqID = 0; @@ -44,23 +42,27 @@ void get_global_fence_choice() { // set of functions from Kokkos 
ToolProgrammingInterface (includes fence) Kokkos::Tools::Experimental::ToolProgrammingInterface tpi_funcs; +uint32_t getDeviceID(uint32_t devid_in) { + int num_device_bits = 7; + int num_instance_bits = 17; + + return (~((uint32_t(-1)) << num_device_bits)) & + (devid_in >> num_instance_bits); +} + void invoke_ktools_fence(uint32_t devID) { if (tpi_funcs.fence != nullptr) { tpi_funcs.fence(devID); + if (tool_verbosity > 1) { + printf("KokkosP: Sampler utility sucessfully invoked " + " tool-induced fence on device %d\n", getDeviceID(devID)); + } } else printf( "KokkosP: FATAL: Kokkos Tools Programming Interface's tool-invoked " "Fence is NULL!\n"); } -uint32_t getDeviceID(uint32_t devid_in) { - int num_device_bits = 7; - int num_instance_bits = 17; - - return (~((uint32_t(-1)) << num_device_bits)) & - (devid_in >> num_instance_bits); -} - void kokkosp_provide_tool_programming_interface( uint32_t num_funcs, Kokkos_Tools_ToolProgrammingInterface* funcsFromTPI) { if (!num_funcs) { @@ -72,12 +74,12 @@ void kokkosp_provide_tool_programming_interface( tpi_funcs = *funcsFromTPI; } -void kokkosp_request_tool_settings(const uint32_t, +void kokkosp_request_tool_settings(const uint32_t , Kokkos_Tools_ToolSettings* settings) { - if (0 == tool_globFence) { - settings->requires_global_fencing = false; - } else { + if (0 < tool_globFence) { settings->requires_global_fencing = true; + } else { + settings->requires_global_fencing = false; } } @@ -91,7 +93,7 @@ void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer, tool_verbosity = 0; } if (NULL != tool_globFence_str) { - tool_globFence = atoi(tool_globFence_str); + tool_globFence = (atoi(tool_globFence_str)) ; } else { tool_globFence = 0; } @@ -191,7 +193,7 @@ void kokkosp_finalize_library() { void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, uint64_t* kID) { *kID = uniqID++; - static uint64_t invocationNum; + static uint64_t invocationNum = 0; ++invocationNum; if ((invocationNum % 
kernelSampleSkip) == 0) { pair infoOfSample; @@ -199,11 +201,11 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, get_global_fence_choice(); // re-read environment variable to get most // accurate if (0 < tool_globFence) { - invoke_ktools_fence(devNum); // invoke tool-induced fence from device + invoke_ktools_fence(devID); // invoke tool-induced fence from device // number device number is negative if (tool_verbosity > 0) printf( - "KokkosP:device number obtained (%lu) from " + "KokkosP: kokkosp_begin_parallel_for(): device number obtained %lu from " "sampler. \n", (unsigned long)devNum); } @@ -213,10 +215,10 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, (unsigned long long)(*kID)); } if (NULL != beginForCallee) { - uint64_t* nestedkID = 0; - (*beginForCallee)(name, devID, nestedkID); // replace kID with nestedkID + uint64_t nestedkID = 0; + (*beginForCallee)(name, devID, &nestedkID); // replace kID with nestedkID sampler_mtx.lock(); - infoOfSample.first = *nestedkID; + infoOfSample.first = nestedkID; infokIDSample.insert({*kID, infoOfSample}); sampler_mtx.unlock(); } else { // no child to call @@ -233,9 +235,7 @@ void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, void kokkosp_end_parallel_for(const uint64_t kID) { pair infoOfMatchedSample; sampler_mtx.lock(); - if (!(infokIDSample.find(kID) == - infokIDSample.end())) { // check that we match the begin parallel for - // kernel call + infoOfMatchedSample = infokIDSample.at(kID); uint32_t devNum; uint32_t devID; @@ -243,13 +243,13 @@ void kokkosp_end_parallel_for(const uint64_t kID) { devNum = getDeviceID(devID); get_global_fence_choice(); // re-read environment variable to get most // accurate value - if (0 < tool_globFence) { + if (tool_globFence) { invoke_ktools_fence( - devNum); // invoke tool-induced fence from device number + devID); // invoke tool-induced fence from device number // make sure device number is not negative if 
(tool_verbosity > 0) { printf( - "KokkosP: device number of sample's kernel ID is %lu " + "KokkosP: kokkosp_end_parallel_for: device number of sample's kernel ID is %lu " " \n", (unsigned long)devNum); } @@ -275,14 +275,13 @@ void kokkosp_end_parallel_for(const uint64_t kID) { if (tool_verbosity > 1) printf("KokkosP: Warning: sampler's endForCallee not found.\n"); } - } // end sampler gather for end_parallel_for sampler_mtx.unlock(); } void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, uint64_t* kID) { *kID = uniqID++; // set memory location value of kID to uniqID - static uint64_t invocationNum; + static uint64_t invocationNum =0 ; ++invocationNum; uint32_t devNum; if ((invocationNum % kernelSampleSkip) == 0) { @@ -295,7 +294,7 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, // Kokkos_C_Profiling_interface. Note that this function // only invokes a fence on the device of the devID passed // - invoke_ktools_fence(devNum); + invoke_ktools_fence(devID); if (tool_verbosity > 1) { printf( "KokkosP: sampler begin_parallel_scan callback" @@ -310,10 +309,10 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, (unsigned long long)(*kID)); } if (NULL != beginScanCallee) { - uint64_t* nestedkID = 0; - (*beginScanCallee)(name, devID, nestedkID); + uint64_t nestedkID = 0; + (*beginScanCallee)(name, devID, &nestedkID); sampler_mtx.lock(); - infoOfSample.first = *nestedkID; + infoOfSample.first = nestedkID; infokIDSample.insert({*kID, infoOfSample}); sampler_mtx.unlock(); } else { @@ -328,9 +327,7 @@ void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, void kokkosp_end_parallel_scan(const uint64_t kID) { pair infoOfMatchedSample; sampler_mtx.lock(); - if ((infokIDSample.find(kID) == - infokIDSample - .end())) { // check that we match the begin scan kernel call + infoOfMatchedSample = infokIDSample.at(kID); uint32_t devNum; uint32_t devID = infoOfMatchedSample.second; @@ -341,12 +338,12 
@@ void kokkosp_end_parallel_scan(const uint64_t kID) { // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function // only invokes a fence on the device fenced on the begin parallel scan. - invoke_ktools_fence(devNum); + invoke_ktools_fence(devID); if (tool_verbosity > 1) { printf( "KokkosP: sampler end_parallel_scan callback" " invoked tool-induced fence on device %lu\n", - (unsigned long)devNum); + (unsigned long) devNum); } } // end invoke fence conditional if (tool_verbosity > 0) { @@ -363,7 +360,6 @@ void kokkosp_end_parallel_scan(const uint64_t kID) { "KokkosP: warning: sampler's end parallel scan has no " "endScanCallee to call\n"); } - } // end kID sample sampler_mtx.unlock(); } // end end_parallel_scan callback @@ -382,11 +378,11 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, // using tool-induced fence from Kokkos_profiling rather than // Kokkos_C_Profiling_interface. Note that this function // only invokes a fence on devNum of devID specified - invoke_ktools_fence(devNum); + invoke_ktools_fence(devID); if (tool_verbosity > 1) { printf( "KokkosP: sampler begin_parallel_reduce obtained " - "dev number, i.e., %lu \n", + "device number %lu \n", (unsigned long)devNum); } } @@ -396,9 +392,9 @@ void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, (unsigned long long)(*kID)); } if (NULL != beginReduceCallee) { - uint64_t* nestedkID = 0; - (*beginReduceCallee)(name, devID, nestedkID); - infoOfSample.first = *nestedkID; + uint64_t nestedkID = 0; + (*beginReduceCallee)(name, devID, &nestedkID); + infoOfSample.first = nestedkID; sampler_mtx.lock(); infokIDSample.insert({*kID, infoOfSample}); sampler_mtx.unlock(); @@ -417,16 +413,15 @@ void kokkosp_end_parallel_reduce(const uint64_t kID) { uint32_t devNum; uint32_t devID; sampler_mtx.lock(); - if (!(infokIDSample.find(kID) == infokIDSample.end())) { infoOfMatchedSample = infokIDSample.at(kID); devID = 
infoOfMatchedSample.second; devNum = getDeviceID(devID); get_global_fence_choice(); // re-read environment variable to get most // accurate value if (0 < tool_globFence) { - invoke_ktools_fence(devNum); + invoke_ktools_fence(devID); if (tool_verbosity > 1) { - printf("KokkosP: sampler's end parallel reduce obtained devNum %lu \n", + printf("KokkosP: sampler's end parallel reduce obtained device number %lu \n", (unsigned long)devNum); } } // end tool invoked fence