From aeb346b6d645d8734c81102ead832f5f06332cd5 Mon Sep 17 00:00:00 2001 From: Ben Richard <143630488+benrichard-amd@users.noreply.github.com> Date: Tue, 9 Jan 2024 10:19:06 -0500 Subject: [PATCH 1/2] Fix MPI test failures (#322) The CI test machines only have 2 MPI slots. MPI tests were failing when requesting 4 CPUs. Update these tests to request 2 CPUs. --- tests/omnitrace-mpi-tests.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/omnitrace-mpi-tests.cmake b/tests/omnitrace-mpi-tests.cmake index 3007c6279..3a2d5427e 100644 --- a/tests/omnitrace-mpi-tests.cmake +++ b/tests/omnitrace-mpi-tests.cmake @@ -13,7 +13,7 @@ omnitrace_add_test( NAME "mpi" TARGET mpi-example MPI ON - NUM_PROCS 4 + NUM_PROCS 2 REWRITE_ARGS -e -v @@ -37,7 +37,7 @@ omnitrace_add_test( NAME "mpi-flat-mpip" TARGET mpi-example MPI ON - NUM_PROCS 4 + NUM_PROCS 2 LABELS "mpip" REWRITE_ARGS -e @@ -60,7 +60,7 @@ omnitrace_add_test( NAME "mpi-flat" TARGET mpi-example MPI ON - NUM_PROCS 4 + NUM_PROCS 2 LABELS "mpip" REWRITE_ARGS -e From 7bc50f5a0ab976fd520d6cb3e6ceffd22e3e448d Mon Sep 17 00:00:00 2001 From: "Jonathan R. 
Madsen" Date: Wed, 10 Jan 2024 05:02:22 -0600 Subject: [PATCH 2/2] Roctracer flush activity fix + perfetto.cfg (#317) * Fix roctracer_flush_activity - invoke roctracer_flush_activity() before disabling domains * create comp::roctracer::flush() - real issue was the global state when roctracer_flush_activity() was called * formatting * Update lib/omnitrace/library/components/roctracer.hpp - provide definition of comp::roctracer::flush when OMNITRACE_USE_ROCTRACER is not defined * omnitrace.cfg -> perfetto.cfg - rename provided perfetto config file (omnitrace.cfg) to perfetto.cfg to avoid confusion * Update lib/core - gpu.hpp: defines for OMNITRACE_USE_{HIP,ROCTRACER,ROCPROFILER,ROCM_SMI} - gpu.cpp - include core/hip_runtime.hpp - fix serialization of hipDeviceProp_t - add hip_runtime.hpp - ensure proper inclusion of hip_runtime.h - add rccl.hpp - ensure proper inclusion of rccl.h * Update lib/omnitrace/library - rcclp.cpp - update includes for rccl - roctracer.hpp - update includes for hip_runtime - components/comm_data.hpp - update includes for rccl - components/rcclp.hpp - update includes for rccl * Update bin/omnitrace-avail/avail.cpp - update includes for hip_runtime * Update examples/rccl/CMakeLists.txt - fix find_package for rccl when CI enabled * Update CMakeLists.txt - set cmake policy CMP0135 to NEW for cmake >= 3.24 - Enable DOWNLOAD_EXTRACT_TIMESTAMP with ExternalProject_Add + URL download method * Update timemory submodule * Update pybind11 submodule * Update pybind11 submodule * Update lib/core/rccl.hpp - include rccl.h only if OMNITRACE_USE_RCCL > 0 * Update lib/core/{gpu,hip_runtime}.hpp * Update lib/core/gpu.cpp - reintroduce some ppdefs * Update lib/core/gpu.cpp - fix ifdef on OMNITRACE_HIP_VERSION * Update lib/core/gpu.cpp - fix static assert for OMNITRACE_HIP_VERSION_MINOR when HIP version 4.x or older (unreliable minor versions) * Update lib/core/gpu.cpp - fix ifdef on OMNITRACE_HIP_VERSION * Update lib/core/config.cpp - disable 
OMNITRACE_PERFETTO_COMBINE_TRACES by default * Update lib/core/perfetto.cpp - if unable to open perfetto temp file, return the ReadTraceBlocking() * Update lib/core/config.* - flush tmpfile before closing --- CMakeLists.txt | 10 +- README.md | 2 +- examples/rccl/CMakeLists.txt | 5 +- external/pybind11 | 2 +- external/timemory | 2 +- omnitrace.cfg => perfetto.cfg | 4 +- source/bin/omnitrace-avail/avail.cpp | 7 +- source/docs/runtime.md | 2 +- source/lib/core/CMakeLists.txt | 2 + source/lib/core/config.cpp | 29 ++- source/lib/core/config.hpp | 1 + source/lib/core/gpu.cpp | 245 ++++++++++++++++-- source/lib/core/hip_runtime.hpp | 56 ++++ source/lib/core/perfetto.cpp | 9 + source/lib/core/rccl.hpp | 35 +++ source/lib/omnitrace/library.cpp | 7 + .../library/components/comm_data.hpp | 9 +- .../omnitrace/library/components/rcclp.hpp | 7 +- .../library/components/roctracer.cpp | 56 ++-- .../library/components/roctracer.hpp | 5 + source/lib/omnitrace/library/rcclp.cpp | 7 +- source/lib/omnitrace/library/roctracer.cpp | 2 - source/lib/omnitrace/library/roctracer.hpp | 1 + tests/omnitrace-rocm-tests.cmake | 12 + 24 files changed, 441 insertions(+), 76 deletions(-) rename omnitrace.cfg => perfetto.cfg (68%) create mode 100644 source/lib/core/hip_runtime.hpp create mode 100644 source/lib/core/rccl.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 12e3c6756..bb2814215 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,6 +71,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON CACHE BOOL "Build position independent code") +if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24) + cmake_policy(SET CMP0135 NEW) +endif() + if("${CMAKE_BUILD_TYPE}" STREQUAL "") set(CMAKE_BUILD_TYPE Release @@ -366,8 +370,8 @@ if(NOT OMNITRACE_USE_ROCPROFILER) endif() configure_file( - ${PROJECT_SOURCE_DIR}/omnitrace.cfg - ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/omnitrace.cfg + ${PROJECT_SOURCE_DIR}/perfetto.cfg + 
${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/perfetto.cfg COPYONLY) configure_file( @@ -381,7 +385,7 @@ configure_file( install( FILES ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/setup-env.sh - ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/omnitrace.cfg + ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/perfetto.cfg DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME} COMPONENT setup) diff --git a/README.md b/README.md index 8437fe50f..451236a35 100755 --- a/README.md +++ b/README.md @@ -325,7 +325,7 @@ Enable `traced` and `perfetto` in the background: ```shell pkill traced traced --background -perfetto --out ./omnitrace-perfetto.proto --txt -c ${OMNITRACE_ROOT}/share/omnitrace.cfg --background +perfetto --out ./omnitrace-perfetto.proto --txt -c ${OMNITRACE_ROOT}/share/omnitrace/perfetto.cfg --background ``` > ***NOTE: if the perfetto tools were installed by omnitrace, replace `traced` with `omnitrace-perfetto-traced` and*** diff --git a/examples/rccl/CMakeLists.txt b/examples/rccl/CMakeLists.txt index 3142ebd2f..4b0d860d7 100644 --- a/examples/rccl/CMakeLists.txt +++ b/examples/rccl/CMakeLists.txt @@ -31,7 +31,10 @@ if(NOT hip_FOUND) return() endif() -if("${CMAKE_PROJECT_NAME}" STREQUAL "omnitrace" AND "$ENV{OMNITRACE_CI}") +if("${CMAKE_PROJECT_NAME}" STREQUAL "omnitrace" + AND ("$ENV{OMNITRACE_CI}" + OR OMNITRACE_CI + OR OMNITRACE_BUILD_CI)) find_package(rccl QUIET) # avoid generating warning in CI else() find_package(rccl) diff --git a/external/pybind11 b/external/pybind11 index ad0de0f5a..1a917f185 160000 --- a/external/pybind11 +++ b/external/pybind11 @@ -1 +1 @@ -Subproject commit ad0de0f5a6bebbebbeb7f8f2f15c0c1430f34268 +Subproject commit 1a917f1852eb7819b671fc3fa862840f4c491a07 diff --git a/external/timemory b/external/timemory index ace5bc4dc..2a1bcba0c 160000 --- a/external/timemory +++ b/external/timemory @@ -1 +1 @@ -Subproject commit ace5bc4dc9134e05c818330b137170692381f6be 
+Subproject commit 2a1bcba0cad46efd4421c0c7a145e83b161fb934 diff --git a/omnitrace.cfg b/perfetto.cfg similarity index 68% rename from omnitrace.cfg rename to perfetto.cfg index 1c1d8304f..146b28d9f 100644 --- a/omnitrace.cfg +++ b/perfetto.cfg @@ -1,6 +1,4 @@ -# perfetto --out OUTPUT_FILE --txt -c omnitrace.cfg -# 5 minute trace, but can be stopped prematurely. -duration_ms: 300000 +# perfetto --out OUTPUT_FILE --txt -c perfetto.cfg write_into_file: true # One buffer allocated within the central tracing binary for the entire trace, diff --git a/source/bin/omnitrace-avail/avail.cpp b/source/bin/omnitrace-avail/avail.cpp index 8af993537..e4e474b1f 100644 --- a/source/bin/omnitrace-avail/avail.cpp +++ b/source/bin/omnitrace-avail/avail.cpp @@ -33,6 +33,7 @@ #include "api.hpp" #include "core/config.hpp" #include "core/gpu.hpp" +#include "core/hip_runtime.hpp" #include "library/rocprofiler.hpp" #include @@ -62,12 +63,6 @@ #include #include -#if defined(OMNITRACE_USE_HIP) && OMNITRACE_USE_HIP > 0 -# include -#elif !defined(OMNITRACE_USE_HIP) -# define OMNITRACE_USE_HIP 0 -#endif - #if defined(TIMEMORY_UNIX) # include // ioctl() and TIOCGWINSZ # include // for STDOUT_FILENO diff --git a/source/docs/runtime.md b/source/docs/runtime.md index 7586a52b7..1e12db59a 100644 --- a/source/docs/runtime.md +++ b/source/docs/runtime.md @@ -205,7 +205,7 @@ OMNITRACE_KOKKOSP_KERNEL_LOGGER = false OMNITRACE_PAPI_EVENTS = PAPI_TOT_CYC OMNITRACE_PERFETTO_BACKEND = inprocess OMNITRACE_PERFETTO_BUFFER_SIZE_KB = 1024000 -OMNITRACE_PERFETTO_COMBINE_TRACES = true +OMNITRACE_PERFETTO_COMBINE_TRACES = false OMNITRACE_PERFETTO_FILE = perfetto-trace.proto OMNITRACE_PERFETTO_FILL_POLICY = discard OMNITRACE_PERFETTO_SHMEM_SIZE_HINT_KB = 4096 diff --git a/source/lib/core/CMakeLists.txt b/source/lib/core/CMakeLists.txt index b18973c60..bef6228c1 100644 --- a/source/lib/core/CMakeLists.txt +++ b/source/lib/core/CMakeLists.txt @@ -29,10 +29,12 @@ set(core_headers 
${CMAKE_CURRENT_LIST_DIR}/dynamic_library.hpp ${CMAKE_CURRENT_LIST_DIR}/exception.hpp ${CMAKE_CURRENT_LIST_DIR}/gpu.hpp + ${CMAKE_CURRENT_LIST_DIR}/hip_runtime.hpp ${CMAKE_CURRENT_LIST_DIR}/locking.hpp ${CMAKE_CURRENT_LIST_DIR}/mproc.hpp ${CMAKE_CURRENT_LIST_DIR}/perf.hpp ${CMAKE_CURRENT_LIST_DIR}/perfetto.hpp + ${CMAKE_CURRENT_LIST_DIR}/rccl.hpp ${CMAKE_CURRENT_LIST_DIR}/redirect.hpp ${CMAKE_CURRENT_LIST_DIR}/state.hpp ${CMAKE_CURRENT_LIST_DIR}/timemory.hpp diff --git a/source/lib/core/config.cpp b/source/lib/core/config.cpp index 4f83e930e..1e4ff5b6b 100644 --- a/source/lib/core/config.cpp +++ b/source/lib/core/config.cpp @@ -31,6 +31,7 @@ #include "perfetto.hpp" #include "utility.hpp" +#include #include #include #include @@ -650,8 +651,7 @@ configure_settings(bool _init) OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_PERFETTO_COMBINE_TRACES", "Combine Perfetto traces. If not explicitly set, it will " "default to the value of OMNITRACE_COLLAPSE_PROCESSES", - _config->get("collapse_processes"), "perfetto", "data", - "advanced"); + false, "perfetto", "data", "advanced"); OMNITRACE_CONFIG_SETTING( bool, "OMNITRACE_PERFETTO_ROCTRACER_PER_STREAM", @@ -2527,9 +2527,34 @@ tmp_file::fopen(const char* _mode) return (file != nullptr && fd > 0); } +bool +tmp_file::flush() /* flushes whichever handle is open (stream or FILE*); returns false only when fflush() still fails after the retries below */ +{ + if(stream.is_open()) + { + stream.flush(); + } + else if(file != nullptr) + { + int _ret = fflush(file); /* fflush() returns 0 or EOF; the reason for a failure is reported through errno */ + int _cnt = 0; + while(_ret != 0 && (errno == EAGAIN || errno == EINTR)) /* retry transient failures only */ + { + std::this_thread::sleep_for(std::chrono::milliseconds{ 100 }); + _ret = fflush(file); + if(++_cnt > 10) break; + } + return (_ret == 0); + } + + return true; +} + bool tmp_file::close() { + flush(); + if(stream.is_open()) { stream.close(); diff --git a/source/lib/core/config.hpp b/source/lib/core/config.hpp index 5ed92bb2f..1a90d9d87 100644 --- a/source/lib/core/config.hpp +++ b/source/lib/core/config.hpp @@ -394,6 +394,7 @@ struct tmp_file bool open(std::ios::openmode = std::ios::binary | std::ios::in | std::ios::out); bool fopen(const 
char* = "r+"); + bool flush(); bool close(); bool remove(); diff --git a/source/lib/core/gpu.cpp b/source/lib/core/gpu.cpp index 9b4e486bf..8adc20b57 100644 --- a/source/lib/core/gpu.cpp +++ b/source/lib/core/gpu.cpp @@ -20,6 +20,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. +#include "common/defines.h" + #if !defined(OMNITRACE_USE_ROCM_SMI) # define OMNITRACE_USE_ROCM_SMI 0 #endif @@ -28,15 +30,17 @@ # define OMNITRACE_USE_HIP 0 #endif +#include "core/hip_runtime.hpp" + #if OMNITRACE_USE_HIP > 0 # if !defined(TIMEMORY_USE_HIP) # define TIMEMORY_USE_HIP 1 # endif #endif -#include "gpu.hpp" #include "debug.hpp" #include "defines.hpp" +#include "gpu.hpp" #include @@ -45,10 +49,19 @@ #endif #if OMNITRACE_USE_HIP > 0 -# include -# include # include +static_assert(OMNITRACE_HIP_VERSION_MAJOR == HIP_VERSION_MAJOR, + "OMNITRACE_HIP_VERSION_MAJOR (detected by cmake) != HIP_VERSION_MAJOR " + "(from )"); + +# if OMNITRACE_HIP_VERSION_MAJOR >= 5 +// HIP versions 4.x and older have unreliable values for HIP_VERSION_MINOR +static_assert(OMNITRACE_HIP_VERSION_MINOR == HIP_VERSION_MINOR, + "OMNITRACE_HIP_VERSION_MINOR (detected by cmake) != HIP_VERSION_MINOR " + "(from )"); +# endif + # if !defined(OMNITRACE_HIP_RUNTIME_CALL) # define OMNITRACE_HIP_RUNTIME_CALL(err) \ { \ @@ -107,6 +120,91 @@ rsmi_init() return _rsmi_init; } #endif + +#if OMNITRACE_HIP_VERSION >= 60000 +template ::value, int> = 0> +void +device_prop_serialize(ArchiveT& archive, const char* name, const ArgT& arg) +{ + namespace cereal = tim::cereal; + using cereal::make_nvp; + archive(make_nvp(name, arg)); +} + +template +void +device_prop_serialize(ArchiveT& archive, const char* name, ArgT arg[N]) +{ + if constexpr(!std::is_same::value && + !std::is_same::value) + { + namespace cereal = tim::cereal; + using cereal::make_nvp; + auto data = std::array{}; + for(size_t i = 0; i < N; ++i) + data[i] = arg[i]; + archive(make_nvp(name, data)); + } + else + { + 
device_prop_serialize(archive, name, std::string{ arg }); + } +} + +template +void +device_prop_serialize(ArchiveT& archive, const char* name, hipUUID_t arg) +{ + constexpr auto N = sizeof(arg.bytes); + namespace cereal = tim::cereal; + using cereal::make_nvp; + auto data = std::array{}; + data.fill('\0'); + for(size_t i = 0; i < N; ++i) + data[i] = arg.bytes[i]; + auto str_v = std::string_view{ data.data() }; + auto str = std::string{ str_v }.substr(0, str_v.find('\0')); + archive(make_nvp(name, str)); +} + +template +void +device_prop_serialize(ArchiveT& archive, const char* name, hipDeviceArch_t arg) +{ + namespace cereal = tim::cereal; + using cereal::make_nvp; + +# define OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(NAME) \ + { \ + auto val = arg.NAME; \ + archive(make_nvp(#NAME, val)); \ + } + + archive.setNextName(name); + archive.startNode(); + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(hasGlobalInt32Atomics) + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(hasGlobalFloatAtomicExch) + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(hasSharedInt32Atomics) + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(hasSharedFloatAtomicExch) + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(hasFloatAtomicAdd) + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(hasGlobalInt64Atomics) + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(hasSharedInt64Atomics) + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(hasDoubles) + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(hasWarpVote) + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(hasWarpBallot) + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(hasWarpShuffle) + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(hasFunnelShift) + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(hasThreadFenceSystem) + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(hasSyncThreadsExt) + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(hasSurfaceFuncs) + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(has3dGrid) + OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH(hasDynamicParallelism) + archive.finishNode(); + +# undef OMNITRACE_SERIALIZE_HIP_DEVICE_ARCH +} +#endif } // namespace int @@ -161,11 +259,10 @@ template void add_hip_device_metadata(ArchiveT& 
ar) { -#if OMNITRACE_USE_HIP > 0 namespace cereal = tim::cereal; using cereal::make_nvp; - using intvec_t = std::vector; +#if OMNITRACE_USE_HIP > 0 int _device_count = 0; int _current_device = 0; hipError_t _device_count_err = hipGetDeviceCount(&_device_count); @@ -183,12 +280,6 @@ add_hip_device_metadata(ArchiveT& ar) if(_current_device_err != hipSuccess || _device_count == 0) return; -# define OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(NAME) \ - ar(make_nvp(#NAME, _device_prop.NAME)); - -# define OMNITRACE_SERIALIZE_HIP_DEVICE_PROP_ARRAY(NAME, ...) \ - ar(make_nvp(NAME, __VA_ARGS__)); - ar.setNextName("hip_device_properties"); ar.startNode(); ar.makeArray(); @@ -205,6 +296,16 @@ add_hip_device_metadata(ArchiveT& ar) OMNITRACE_HIP_RUNTIME_CALL(hipRuntimeGetVersion(&_runtime_version)); ar.startNode(); + +# if OMNITRACE_HIP_VERSION < 60000 + using intvec_t = std::vector; + +# define OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(NAME) \ + ar(make_nvp(#NAME, _device_prop.NAME)); + +# define OMNITRACE_SERIALIZE_HIP_DEVICE_PROP_ARRAY(NAME, ...) 
\ + ar(make_nvp(NAME, __VA_ARGS__)); + ar(make_nvp("name", std::string{ _device_prop.name })); ar(make_nvp("driver_version", _driver_version)); ar(make_nvp("runtime_version", _runtime_version)); @@ -215,11 +316,11 @@ add_hip_device_metadata(ArchiveT& ar) OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(totalConstMem) OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(clockRate) -# if OMNITRACE_HIP_VERSION >= 5000 +# if OMNITRACE_HIP_VERSION >= 50000 OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(memoryClockRate) OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(memoryBusWidth) OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(l2CacheSize) -# endif +# endif OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerBlock) OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(regsPerBlock) @@ -247,7 +348,6 @@ add_hip_device_metadata(ArchiveT& ar) OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(pciBusID) OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(pciDeviceID) OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(computeMode) - OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(computeMode) OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(gcnArch) OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(gcnArchName) OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(isMultiGpuBoard) @@ -259,8 +359,118 @@ add_hip_device_metadata(ArchiveT& ar) OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(concurrentKernels) OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxSharedMemoryPerMultiProcessor) OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(asicRevision) +# else +# define OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(NAME) \ + device_prop_serialize(ar, #NAME, _device_prop.NAME); + + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(name) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(uuid) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(luid) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(luidDeviceNodeMask) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(totalGlobalMem) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerBlock) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(regsPerBlock) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(warpSize) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(memPitch) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerBlock) + 
OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxThreadsDim) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxGridSize) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(clockRate) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(totalConstMem) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(major) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(minor) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(textureAlignment) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(texturePitchAlignment) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(deviceOverlap) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(multiProcessorCount) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(kernelExecTimeoutEnabled) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(integrated) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(canMapHostMemory) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(computeMode) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxTexture1D) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxTexture1DMipmap) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxTexture1DLinear) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxTexture2D) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DMipmap) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DLinear) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DGather) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxTexture3D) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxTexture3DAlt) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxTextureCubemap) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxTexture1DLayered) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DLayered) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxTextureCubemapLayered) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxSurface1D) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxSurface2D) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxSurface3D) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxSurface1DLayered) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxSurface2DLayered) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxSurfaceCubemap) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxSurfaceCubemapLayered) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(surfaceAlignment) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(concurrentKernels) + 
OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(ECCEnabled) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(pciBusID) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(pciDeviceID) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(pciDomainID) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(tccDriver) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(asyncEngineCount) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(unifiedAddressing) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(memoryClockRate) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(memoryBusWidth) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(l2CacheSize) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(persistingL2CacheMaxSize) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerMultiProcessor) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(streamPrioritiesSupported) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(globalL1CacheSupported) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(localL1CacheSupported) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerMultiprocessor) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(regsPerMultiprocessor) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(managedMemory) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(isMultiGpuBoard) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(multiGpuBoardGroupID) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(hostNativeAtomicSupported) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(singleToDoublePrecisionPerfRatio) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccess) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(concurrentManagedAccess) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(computePreemptionSupported) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(canUseHostPointerForRegisteredMem) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(cooperativeLaunch) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceLaunch) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerBlockOptin) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccessUsesHostPageTables) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(directManagedMemAccessFromHost) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxBlocksPerMultiProcessor) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(accessPolicyMaxWindowSize) 
+ OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(reservedSharedMemPerBlock) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(hostRegisterSupported) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(sparseHipArraySupported) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(hostRegisterReadOnlySupported) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(timelineSemaphoreInteropSupported) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(memoryPoolsSupported) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(gpuDirectRDMASupported) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(gpuDirectRDMAFlushWritesOptions) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(gpuDirectRDMAWritesOrdering) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(memoryPoolSupportedHandleTypes) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(deferredMappingHipArraySupported) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(ipcEventSupported) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(clusterLaunch) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(unifiedFunctionPointers) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(gcnArchName) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(maxSharedMemoryPerMultiProcessor) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(clockInstructionRate) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(arch) + // OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(hdpMemFlushCntl) + // OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(hdpRegFlushCntl) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedFunc) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedGridDim) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedBlockDim) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedSharedMem) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(isLargeBar) + OMNITRACE_SERIALIZE_HIP_DEVICE_PROP(asicRevision) +# endif - const char* _compute_mode_descr[] = { + constexpr auto _compute_mode_descr = std::array{ "Default (multiple host threads can use ::hipSetDevice() with device " "simultaneously)", "Exclusive (only one host thread in one process is able to use " @@ -271,8 +481,11 @@ add_hip_device_metadata(ArchiveT& ar) "Unknown", nullptr }; 
+ + auto _compute_mode = std::min(_device_prop.computeMode, static_cast<int>(_compute_mode_descr.size()) - 2); /* clamp to the "Unknown" entry: the last element is a nullptr terminator and must never reach std::string */ ar(make_nvp("computeModeDescription", - std::string{ _compute_mode_descr[_device_prop.computeMode] })); + std::string{ _compute_mode_descr.at(_compute_mode) })); + ar.finishNode(); } #else diff --git a/source/lib/core/hip_runtime.hpp b/source/lib/core/hip_runtime.hpp new file mode 100644 index 000000000..492af153e --- /dev/null +++ b/source/lib/core/hip_runtime.hpp @@ -0,0 +1,56 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +#include "core/defines.hpp" + +#if defined(OMNITRACE_USE_HIP) && OMNITRACE_USE_HIP > 0 + +# if defined(HIP_INCLUDE_HIP_HIP_RUNTIME_H) || \ + defined(HIP_INCLUDE_HIP_HIP_RUNTIME_API_H) +# error \ + "include core/hip_runtime.hpp before or " +# endif + +# define HIP_PROF_HIP_API_STRING 1 + +// following must be included before for ROCm 6.0+ +# if OMNITRACE_HIP_VERSION >= 60000 +# if defined(USE_PROF_API) +# undef USE_PROF_API +# endif +# include +# include +// must be included after hip_runtime_api.h +# include +// must be included after hip_runtime_api.h +# include +// must be included after hip_runtime_api.h +# include +# else +# include +# include +# endif + +# include +#endif diff --git a/source/lib/core/perfetto.cpp b/source/lib/core/perfetto.cpp index ef7fa0115..a4725aea8 100644 --- a/source/lib/core/perfetto.cpp +++ b/source/lib/core/perfetto.cpp @@ -174,6 +174,15 @@ post_process(tim::manager* _timemory_manager, bool& _perfetto_output_error) { _tmp_file->close(); FILE* _fdata = fopen(_tmp_file->filename.c_str(), "rb"); + + if(!_fdata) + { + OMNITRACE_VERBOSE( + -1, "Error! perfetto temp trace file '%s' could not be read", + _tmp_file->filename.c_str()); + return char_vec_t{ tracing_session->ReadTraceBlocking() }; + } + fseek(_fdata, 0, SEEK_END); size_t _fnum_elem = ftell(_fdata); fseek(_fdata, 0, SEEK_SET); // same as rewind(f); diff --git a/source/lib/core/rccl.hpp b/source/lib/core/rccl.hpp new file mode 100644 index 000000000..0f97e3d3a --- /dev/null +++ b/source/lib/core/rccl.hpp @@ -0,0 +1,35 @@ +// MIT License +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#pragma once + +#include "core/defines.hpp" +#include "core/hip_runtime.hpp" + +#if defined(OMNITRACE_USE_HIP) && OMNITRACE_USE_HIP > 0 && \ + defined(OMNITRACE_USE_RCCL) && OMNITRACE_USE_RCCL > 0 +# if OMNITRACE_HIP_VERSION == 0 || OMNITRACE_HIP_VERSION >= 50200 +# include +# else +# include +# endif +#endif diff --git a/source/lib/omnitrace/library.cpp b/source/lib/omnitrace/library.cpp index 2abaac54f..63b6b2aff 100644 --- a/source/lib/omnitrace/library.cpp +++ b/source/lib/omnitrace/library.cpp @@ -726,6 +726,13 @@ omnitrace_finalize_hidden(void) } } + if(get_use_roctracer()) + { + OMNITRACE_VERBOSE_F(1, "Flushing roctracer...\n"); + // ensure that roctracer is flushed before setting the state to finalized + comp::roctracer::flush(); + } + set_state(State::Finalized); push_enable_sampling_on_child_threads(false); diff --git a/source/lib/omnitrace/library/components/comm_data.hpp b/source/lib/omnitrace/library/components/comm_data.hpp index 7609c29d6..0c297bb5d 100644 --- a/source/lib/omnitrace/library/components/comm_data.hpp +++ b/source/lib/omnitrace/library/components/comm_data.hpp @@ -26,6 +26,7 @@ #include "core/common.hpp" #include "core/components/fwd.hpp" #include "core/defines.hpp" +#include "core/rccl.hpp" #include "core/timemory.hpp" #include "library/components/category_region.hpp" @@ -37,14 +38,6 @@ #include -#if defined(OMNITRACE_USE_RCCL) -# if OMNITRACE_HIP_VERSION == 0 || OMNITRACE_HIP_VERSION >= 50200 -# include -# else -# include -# endif -#endif - #if defined(OMNITRACE_USE_MPI) # include #endif diff --git a/source/lib/omnitrace/library/components/rcclp.hpp b/source/lib/omnitrace/library/components/rcclp.hpp index 2260aafd4..f0d6b0296 100644 --- a/source/lib/omnitrace/library/components/rcclp.hpp +++ b/source/lib/omnitrace/library/components/rcclp.hpp @@ -25,6 +25,7 @@ #include "core/common.hpp" #include "core/components/fwd.hpp" #include "core/defines.hpp" +#include "core/rccl.hpp" #include "core/timemory.hpp" #include 
"library/components/category_region.hpp" #include "library/components/comm_data.hpp" @@ -32,12 +33,6 @@ #include #include -#if OMNITRACE_HIP_VERSION == 0 || OMNITRACE_HIP_VERSION >= 50200 -# include -#else -# include -#endif - #include #include #include diff --git a/source/lib/omnitrace/library/components/roctracer.cpp b/source/lib/omnitrace/library/components/roctracer.cpp index 7230276eb..748a7b16f 100644 --- a/source/lib/omnitrace/library/components/roctracer.cpp +++ b/source/lib/omnitrace/library/components/roctracer.cpp @@ -32,6 +32,7 @@ #include "library/thread_data.hpp" #include "library/thread_info.hpp" +#include #include #define HIP_PROF_HIP_API_STRING 1 @@ -272,6 +273,41 @@ roctracer::setup(void* table, bool on_load_trace) OMNITRACE_VERBOSE_F(1, "roctracer is setup\n"); } +void +roctracer::flush() +{ + auto wait_for_activity_flush_completion = []() { + uint16_t nitr = 0; + while(roctracer_activity_count() > 0 && nitr++ < 10) + std::this_thread::sleep_for(std::chrono::milliseconds{ 100 }); + }; + + // a flush may already be happening + wait_for_activity_flush_completion(); + + if(roctracer_activity_count() == 0) + { + OMNITRACE_VERBOSE_F(2, "executing roctracer_flush_activity()...\n"); + OMNITRACE_ROCTRACER_CALL(roctracer_flush_activity()); + // wait to make sure flush completes + std::this_thread::sleep_for(std::chrono::milliseconds{ 100 }); + wait_for_activity_flush_completion(); + } + else + { + OMNITRACE_CI_FAIL(true, + "roctracer_activity_count() != 0 (== %li). 
" + "roctracer::shutdown() most likely called during abort", + roctracer_activity_count().load()); + } + + OMNITRACE_VERBOSE_F(2, "executing hip_exec_activity_callbacks(0..%zu)\n", + thread_info::get_peak_num_threads()); + // make sure all async operations are executed + for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i) + hip_exec_activity_callbacks(i); +} + void roctracer::shutdown() { @@ -282,16 +318,11 @@ roctracer::shutdown() tim::storage::instance()->reset(); return; } + roctracer_is_setup() = false; OMNITRACE_VERBOSE_F(1, "shutting down roctracer...\n"); - OMNITRACE_VERBOSE_F(2, "executing hip_exec_activity_callbacks(0..%zu)\n", - thread_info::get_peak_num_threads()); - // make sure all async operations are executed - for(size_t i = 0; i < thread_info::get_peak_num_threads(); ++i) - hip_exec_activity_callbacks(i); - // callback for hsa OMNITRACE_VERBOSE_F(2, "executing %zu roctracer_shutdown_routines...\n", roctracer_shutdown_routines().size()); @@ -352,19 +383,6 @@ roctracer::shutdown() roctracer_disable_op_activity(ACTIVITY_DOMAIN_HSA_OPS, HSA_OP_ID_COPY)); } - if(roctracer_activity_count() == 0) - { - OMNITRACE_VERBOSE_F(2, "executing roctracer_flush_activity()...\n"); - OMNITRACE_ROCTRACER_CALL(roctracer_flush_activity()); - } - else - { - OMNITRACE_CI_FAIL(true, - "roctracer_activity_count() != 0 (== %li). 
" - "roctracer::shutdown() most likely called during abort", - roctracer_activity_count().load()); - } - OMNITRACE_VERBOSE_F(1, "roctracer is shutdown\n"); } diff --git a/source/lib/omnitrace/library/components/roctracer.hpp b/source/lib/omnitrace/library/components/roctracer.hpp index d58711504..5423f0d02 100644 --- a/source/lib/omnitrace/library/components/roctracer.hpp +++ b/source/lib/omnitrace/library/components/roctracer.hpp @@ -58,6 +58,7 @@ struct roctracer static bool is_setup(); static void setup(void* hsa_api_table, bool on_load_trace = false); + static void flush(); static void shutdown(); static void add_setup(const std::string&, std::function&&); static void add_shutdown(const std::string&, std::function&&); @@ -77,6 +78,10 @@ inline void roctracer::setup(void*, bool) {} +inline void +roctracer::flush() +{} + inline void roctracer::shutdown() {} diff --git a/source/lib/omnitrace/library/rcclp.cpp b/source/lib/omnitrace/library/rcclp.cpp index 1751544a7..a362d985a 100644 --- a/source/lib/omnitrace/library/rcclp.cpp +++ b/source/lib/omnitrace/library/rcclp.cpp @@ -26,17 +26,12 @@ #include "core/components/fwd.hpp" #include "core/defines.hpp" #include "core/dynamic_library.hpp" +#include "core/rccl.hpp" #include "core/timemory.hpp" #include "library/components/category_region.hpp" #include -#if OMNITRACE_HIP_VERSION == 0 || OMNITRACE_HIP_VERSION >= 50200 -# include -#else -# include -#endif - #include #include #include diff --git a/source/lib/omnitrace/library/roctracer.cpp b/source/lib/omnitrace/library/roctracer.cpp index 52be9158a..7ce7bf144 100644 --- a/source/lib/omnitrace/library/roctracer.cpp +++ b/source/lib/omnitrace/library/roctracer.cpp @@ -44,8 +44,6 @@ #include #include -#define HIP_PROF_HIP_API_STRING 1 - #include #include #include diff --git a/source/lib/omnitrace/library/roctracer.hpp b/source/lib/omnitrace/library/roctracer.hpp index 187a1776c..5b773389e 100644 --- a/source/lib/omnitrace/library/roctracer.hpp +++ 
b/source/lib/omnitrace/library/roctracer.hpp @@ -24,6 +24,7 @@ #include "core/config.hpp" #include "core/debug.hpp" +#include "core/hip_runtime.hpp" #include "core/perfetto.hpp" #include "library/components/roctracer.hpp" #include "library/ptl.hpp" diff --git a/tests/omnitrace-rocm-tests.cmake b/tests/omnitrace-rocm-tests.cmake index 1e12a3e53..cf40180ad 100644 --- a/tests/omnitrace-rocm-tests.cmake +++ b/tests/omnitrace-rocm-tests.cmake @@ -28,6 +28,18 @@ omnitrace_add_test( uniform_int_distribution ENVIRONMENT "${_base_environment};OMNITRACE_CRITICAL_TRACE=ON") +omnitrace_add_test( + SKIP_REWRITE SKIP_RUNTIME + NAME transpose-two-kernels + TARGET transpose + MPI OFF + GPU ON + NUM_PROCS 1 + RUN_ARGS 1 2 2 + ENVIRONMENT + "${_base_environment};OMNITRACE_CRITICAL_TRACE=OFF;OMNITRACE_ROCTRACER_HSA_ACTIVITY=OFF;OMNITRACE_ROCTRACER_HSA_API=OFF" + ) + omnitrace_add_test( SKIP_BASELINE SKIP_RUNTIME NAME transpose-loops