diff --git a/CMakeLists.txt b/CMakeLists.txt index 66af3963..cc60f45a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,6 +50,7 @@ option(GC_ENABLE_BINDINGS_PYTHON "Enable Graph Complier Python Binding" ON) option(GC_DEV_LINK_LLVM_DYLIB "Link dynamic libraries of LLVM and MLIR. For developers only. Do not use it in packing the library." OFF) option(GC_ENABLE_RUNTIME_NAIVE_BRGEMM "Use naive BRGEMM as runtime backend for debug purpose." OFF) option(GC_BENCH_ENABLE "Build benchgc." ON) +option(GC_ENABLE_GPU_PROFILE "Enable the GPU kernel profiling." OFF) if(GC_ENABLE_LEGACY) add_subdirectory(legacy/core) diff --git a/README.md b/README.md index 23ddea37..5f5a3ed3 100644 --- a/README.md +++ b/README.md @@ -77,4 +77,5 @@ Graph Compiler supports the following build-time options. | GC_DEV_LINK_LLVM_DYLIB | ON, **OFF** | Controls dynamic link LLVM/MLIR libraries, mainly for developer | | GC_ENABLE_BINDINGS_PYTHON | **ON**, OFF | Controls building the Python API | | GC_ENABLE_IMEX | ON, **OFF** | Whether to enable the GPU components | +| GC_ENABLE_GPU_PROFILE | ON, **OFF** | Whether to enable the GPU profiling which will profile the kernel execution time | diff --git a/cmake/ptigpu.cmake b/cmake/ptigpu.cmake new file mode 100644 index 00000000..762b77ff --- /dev/null +++ b/cmake/ptigpu.cmake @@ -0,0 +1,9 @@ +include_guard() + +FetchContent_Declare( + PTIGPU + GIT_REPOSITORY https://github.com/intel/pti-gpu.git + GIT_TAG exp_opencl_0.11.0 + SOURCE_SUBDIR sdk +) +FetchContent_MakeAvailable(PTIGPU) diff --git a/lib/gc/ExecutionEngine/GPURuntime/ocl/CMakeLists.txt b/lib/gc/ExecutionEngine/GPURuntime/ocl/CMakeLists.txt index 37cd56b8..2f268cc5 100644 --- a/lib/gc/ExecutionEngine/GPURuntime/ocl/CMakeLists.txt +++ b/lib/gc/ExecutionEngine/GPURuntime/ocl/CMakeLists.txt @@ -8,3 +8,10 @@ gc_add_mlir_library(GcGpuOclRuntime ) target_include_directories(GcGpuOclRuntime PUBLIC ${OpenCL_INCLUDE_DIRS}) target_link_libraries(GcGpuOclRuntime PUBLIC ${OpenCL_LIBRARIES}) + 
+if(GC_ENABLE_GPU_PROFILE)
+  include(ptigpu)
+  find_package(Pti REQUIRED)
+  target_link_libraries(GcGpuOclRuntime PRIVATE Pti::pti_view)
+  add_definitions(-DGC_ENABLE_GPU_PROFILE)
+endif ()
diff --git a/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp b/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp
index 6798cd41..1c54bc60 100644
--- a/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp
+++ b/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp
@@ -22,6 +22,195 @@
 #include "mlir/Interfaces/DataLayoutInterfaces.h"
 #include "mlir/Pass/PassManager.h"
 
+#ifdef GC_ENABLE_GPU_PROFILE
+#include "PtiGpuUtils.h"
+#include "pti/pti_view.h"
+// Correlation ids grouped by (external kind, external id); filled from
+// PTI_VIEW_EXTERNAL_CORRELATION records.
+std::map<std::pair<pti_view_external_kind, uint64_t>, std::vector<uint32_t>>
+    external_corr_map;
+// Names of the kernel / memory operations seen so far, keyed by correlation id.
+std::map<uint32_t, std::string> runtime_enq_2_gpu_kernel_name_map;
+std::map<uint32_t, std::string> runtime_enq_2_gpu_mem_op_name_map;
+
+// Configures PTI view collection for as long as an instance exists: the
+// constructor registers the buffer callbacks and enables the kernel, memory
+// copy/fill, OpenCL call and external correlation views; the destructor
+// disables those same views and flushes the pending records.
+class GPUKernelTracer {
+public:
+  GPUKernelTracer() {
+    gcLogD("Enable Profiling.");
+    ptiViewSetCallbacks(
+        // Buffer request: provide storage sized for 100 kernel records.
+        [](auto **buf, auto *buf_size) {
+          *buf_size = sizeof(pti_view_record_kernel) * 100;
+          void *ptr = ::operator new(*buf_size);
+          ptr = std::align(8, sizeof(unsigned char), ptr, *buf_size);
+          *buf = reinterpret_cast<unsigned char *>(ptr);
+          if (!*buf) {
+            std::abort();
+          }
+          return;
+        },
+        // Buffer completion: print every record in the buffer, then free it.
+        [](auto *buf, auto buf_size, auto valid_buf_size) {
+          if (!buf || !valid_buf_size || !buf_size) {
+            std::cerr << "Received empty buffer" << '\n';
+            if (valid_buf_size) {
+              ::operator delete(buf);
+            }
+            return;
+          }
+          pti_view_record_base *ptr = nullptr;
+          while (true) {
+            auto buf_status = ptiViewGetNextRecord(buf, valid_buf_size, &ptr);
+            if (buf_status == pti_result::PTI_STATUS_END_OF_BUFFER) {
+              std::cout << "Reached End of buffer" << '\n';
+              break;
+            }
+            if (buf_status != pti_result::PTI_SUCCESS) {
+              std::cerr << "Found Error Parsing Records from PTI" << '\n';
+              break;
+            }
+            switch (ptr->_view_kind) {
+            case pti_view_kind::PTI_VIEW_INVALID: {
+              std::cout << "Found Invalid Record" << '\n';
+              break;
+            }
+            case pti_view_kind::PTI_VIEW_DEVICE_GPU_MEM_COPY: {
+              std::cout << "---------------------------------------------------"
+                           "-----------------------------"
+                        << '\n';
+              pti_view_record_memory_copy *rec =
+                  reinterpret_cast<pti_view_record_memory_copy *>(ptr);
+              runtime_enq_2_gpu_mem_op_name_map[rec->_correlation_id] =
+                  rec->_name;
+              std::cout << "Found Memory Record" << '\n';
+              samples_utils::dump_record(rec);
+              std::cout << "---------------------------------------------------"
+                           "-----------------------------"
+                        << '\n';
+              break;
+            }
+            case pti_view_kind::PTI_VIEW_DEVICE_GPU_MEM_FILL: {
+              std::cout << "---------------------------------------------------"
+                           "-----------------------------"
+                        << '\n';
+              pti_view_record_memory_fill *rec =
+                  reinterpret_cast<pti_view_record_memory_fill *>(ptr);
+              runtime_enq_2_gpu_mem_op_name_map[rec->_correlation_id] =
+                  rec->_name;
+              std::cout << "Found Memory Record" << '\n';
+              samples_utils::dump_record(rec);
+              std::cout << "---------------------------------------------------"
+                           "-----------------------------"
+                        << '\n';
+              break;
+            }
+            case pti_view_kind::PTI_VIEW_DEVICE_GPU_KERNEL: {
+              std::cout << "---------------------------------------------------"
+                           "-----------------------------"
+                        << '\n';
+              pti_view_record_kernel *rec =
+                  reinterpret_cast<pti_view_record_kernel *>(ptr);
+              runtime_enq_2_gpu_kernel_name_map[rec->_correlation_id] =
+                  rec->_name;
+              std::cout << "Found Kernel Record" << '\n';
+              samples_utils::dump_record(rec);
+
+              std::cout << "---------------------------------------------------"
+                           "-----------------------------"
+                        << '\n';
+              if (samples_utils::isMonotonic(
+                      {rec->_sycl_task_begin_timestamp,
+                       rec->_sycl_enqk_begin_timestamp, rec->_append_timestamp,
+                       rec->_submit_timestamp, rec->_start_timestamp,
+                       rec->_end_timestamp})) {
+                std::cout << "------------> All Monotonic" << std::endl;
+              } else {
+                std::cerr
+                    << "------------> Something wrong: NOT All monotonic"
+                    << std::endl;
+              }
+              if (rec->_sycl_task_begin_timestamp == 0) {
+                std::cerr << "------------> Something wrong: Sycl Task "
+                             "Begin Time is 0"
+                          << std::endl;
+              }
+              if (rec->_sycl_enqk_begin_timestamp == 0) {
+                std::cerr << "------------> Something wrong: Sycl Enq "
+                             "Launch Kernel Time is 0"
+                          << std::endl;
+              }
+
+              break;
+            }
+            case pti_view_kind::PTI_VIEW_EXTERNAL_CORRELATION: {
+              std::cout << "---------------------------------------------------"
+                           "-----------------------------"
+                        << '\n';
+              pti_view_record_external_correlation *rec =
+                  reinterpret_cast<pti_view_record_external_correlation *>(ptr);
+
+              external_corr_map[std::pair{rec->_external_kind,
+                                          rec->_external_id}]
+                  .push_back(rec->_correlation_id);
+              samples_utils::dump_record(rec);
+              break;
+            }
+            case pti_view_kind::PTI_VIEW_OPENCL_CALLS: {
+              std::cout << "---------------------------------------------------"
+                           "-----------------------------"
+                        << '\n';
+              pti_view_record_oclcalls *rec =
+                  reinterpret_cast<pti_view_record_oclcalls *>(ptr);
+              samples_utils::dump_record(rec);
+              break;
+            }
+            default: {
+              std::cerr << "This shouldn't happen" << '\n';
+              break;
+            }
+            }
+          }
+          ::operator delete(buf);
+        });
+    ptiViewSetOclProfiling();
+
+    ptiViewEnable(PTI_VIEW_DEVICE_GPU_KERNEL);
+    ptiViewEnable(PTI_VIEW_DEVICE_GPU_MEM_COPY);
+    ptiViewEnable(PTI_VIEW_DEVICE_GPU_MEM_FILL);
+    ptiViewEnable(PTI_VIEW_OPENCL_CALLS);
+    ptiViewEnable(PTI_VIEW_EXTERNAL_CORRELATION);
+  }
+
+  ~GPUKernelTracer() {
+    gcLogD("Profiling is finished.");
+    // Disable every view the constructor enabled before the final flush.
+    ptiViewDisable(PTI_VIEW_DEVICE_GPU_KERNEL);
+    ptiViewDisable(PTI_VIEW_DEVICE_GPU_MEM_COPY);
+    ptiViewDisable(PTI_VIEW_DEVICE_GPU_MEM_FILL);
+    ptiViewDisable(PTI_VIEW_OPENCL_CALLS);
+    ptiViewDisable(PTI_VIEW_EXTERNAL_CORRELATION);
+    ptiFlushAllViews();
+  }
+};
+
+/*
+Create an RAII tracer with static storage duration to trace all device kernel
+executions during the program. The tracer's constructor registers the PTI
+record callbacks and enables the views, so records are collected for the
+OpenCL calls made afterwards. When the tracer is destroyed, the views are
+disabled and flushed, and the records collected during its lifetime are
+printed by the buffer-completion callback. For a standalone implementation of
+similar enable/disable profiling hooks, see
+https://github.com/intel/pti-gpu/blob/master/tools/onetrace/tool.cc.
+*/
+static GPUKernelTracer tracer;
+
+#endif
+
 namespace mlir::gc::gpu {
 
 #define makeClErrPref(code) "OpenCL error ", code, ": "
@@ -1014,4 +1203,4 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
   return cache.emplace(OclDevCtxPair(ext.device, ext.context), ptr)
       .first->second;
 }
-} // namespace mlir::gc::gpu
\ No newline at end of file
+} // namespace mlir::gc::gpu
diff --git a/lib/gc/ExecutionEngine/GPURuntime/ocl/PtiGpuUtils.h b/lib/gc/ExecutionEngine/GPURuntime/ocl/PtiGpuUtils.h
new file mode 100644
index 00000000..0711f9b4
--- /dev/null
+++ b/lib/gc/ExecutionEngine/GPURuntime/ocl/PtiGpuUtils.h
@@ -0,0 +1,310 @@
+//===-- PtiGpuUtils.h - DESC ------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//==============================================================
+// Copyright (C) Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+#ifndef SAMPLES_UTILS_H_
+#define SAMPLES_UTILS_H_
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+
+#include "pti/pti_view.h"
+
+namespace samples_utils {
+
+inline constexpr auto kDefaultPtiBufferAlignment = std::align_val_t{1};
+
+template <typename T>
+[[nodiscard]] inline T *AlignedAlloc(std::size_t size, std::align_val_t align) {
+  return static_cast<T *>(::operator new(size, align));
+}
+
+template <typename T>
+inline void AlignedDealloc(T *buf_ptr, std::align_val_t align) {
+  ::operator delete(buf_ptr, align);
+}
+
+template <typename T> [[nodiscard]] inline T *AlignedAlloc(std::size_t size) {
+  return AlignedAlloc<T>(size, kDefaultPtiBufferAlignment);
+}
+
+template <typename T> inline void AlignedDealloc(T *buf_ptr) {
+  return AlignedDealloc(buf_ptr, kDefaultPtiBufferAlignment);
+}
+
+template <typename... T> constexpr std::size_t ValidateTimestamps(T... args) {
+  using TimestampType = std::common_type_t<T...>;
+  constexpr auto count = sizeof...(args);
+  static_assert(count > 1, "Must provide more than one timestamp to validate");
+  std::size_t found_issues = 0;
+  TimestampType prev_stamp = 0;
+
+  // Fold over the pack with an immediately-invoked lambda and count every
+  // timestamp that is smaller than the one before it (the first timestamp is
+  // compared against 0). Fold expressions:
+  // https://en.cppreference.com/w/cpp/language/fold
+  (
+      [&] {
+        auto next_stamp = args;
+        if (!(prev_stamp <= next_stamp)) {
+          found_issues++;
+        }
+        prev_stamp = next_stamp;
+      }(),
+      ...);
+  return found_issues;
+}
+
+//
+// Returns: true  - if a_list is a non-decreasing sequence (every element is
+//                  <= its successor); an empty list also yields true.
+//          false - otherwise.
+// Assumption: operator<= is well defined for T.
+//
+template <typename T> inline bool isMonotonic(std::initializer_list<T> a_list) {
+  // Track the previous element by pointer so an empty list is never
+  // dereferenced; the first iteration compares the first element to itself.
+  const T *previous = a_list.begin();
+  for (const T *item = previous; item != a_list.end(); ++item) {
+    if (!(*previous <= *item))
+      return false;
+    previous = item;
+  }
+  return true;
+}
+
+// Returns additional_string followed by the UUID bytes (last first) as hex.
+inline std::string stringify_uuid(uint8_t *uuid,
+                                  std::string additional_string) {
+  std::stringstream sstream;
+  sstream << additional_string << std::hex << std::setfill('0');
+  for (uint32_t i = 1; i <= PTI_MAX_DEVICE_UUID_SIZE; ++i) {
+    sstream << std::setw(2);
+    sstream << static_cast<uint32_t>(uuid[PTI_MAX_DEVICE_UUID_SIZE - i]);
+    if (i == 4 || i == 6 || i == 8 || i == 10)
+      sstream << "-";
+  }
+  return sstream.str();
+}
+
+inline void print_uuid(uint8_t *uuid, std::string additional_string) {
+  std::cout << stringify_uuid(uuid, std::move(additional_string)) << std::endl;
+}
+
+inline void dump_record(pti_view_record_kernel *record) {
+  if (nullptr == record)
+    return;
+
+  std::cout << "Kernel Name: " << record->_name << '\n';
+  std::cout << " Ze Kernel Append Time: "
+            << record->_append_timestamp << " ns" << '\n';
+  std::cout << " Ze Kernel Submit Time: "
+            << record->_submit_timestamp << " ns" << '\n';
+  std::cout << " Ze Kernel Start Time: "
+            << record->_start_timestamp << " ns" << '\n';
+  std::cout << " Ze Kernel End Time: "
+            << record->_end_timestamp << " ns" << '\n';
+  std::cout << "Kernel Queue Handle: " << record->_queue_handle << '\n';
+  std::cout << "Kernel Queue ID: " << record->_sycl_queue_id << '\n';
+  std::cout << "Kernel CommandList Context Handle: " << record->_context_handle
+            << '\n';
+  std::cout << "Kernel Id: " << std::dec << record->_kernel_id << '\n';
+  std::cout << "Correlation Id: " << std::dec << record->_correlation_id
+            << '\n';
+  std::cout << "Kernel Thread Id: " << std::dec << record->_thread_id << '\n';
+  std::cout << " Sycl Kernel Task Begin Time: " << std::dec
+            << record->_sycl_task_begin_timestamp << " ns" << '\n';
+  std::cout << "Sycl Kernel EnqueueKernel Begin Time: " << std::dec
+            << record->_sycl_enqk_begin_timestamp << " ns" << '\n';
+  std::cout << "Kernel Execution Time: " << std::dec
+            << record->_end_timestamp - record->_start_timestamp << " ns"
+            << '\n';
+  std::cout << "Kernel File Name: " << record->_source_file_name << ":"
+            << record->_source_line_number << '\n';
+  std::cout << "Kernel Device: " << record->_pci_address << '\n';
+  print_uuid(record->_device_uuid, "Kernel Device UUID: ");
+  std::cout << "Kernel NodeID:InvocationID " << record->_sycl_node_id << ':'
+            << record->_sycl_invocation_id << '\n';
+}
+
+inline void dump_record(pti_view_record_memory_copy *record) {
+  if (nullptr == record)
+    return;
+
+  std::cout << "Memory Op: " << record->_name << '\n';
+  std::cout << "Memory Device: " << record->_pci_address << '\n';
+  print_uuid(record->_device_uuid, "Memory Device UUID: ");
+  std::cout << "Memory Op Execution Time: " << std::dec
+            << record->_end_timestamp - record->_start_timestamp << " ns"
+            << '\n';
+  std::cout << " Memory Op Append Time: " << std::dec
+            << record->_append_timestamp << " ns" << '\n';
+  std::cout << " Memory Op Submit Time: " << std::dec
+            << record->_submit_timestamp << " ns" << '\n';
+  std::cout << " Memory Op Start Time: " << std::dec
+            << record->_start_timestamp << " ns" << '\n';
+  std::cout << " Memory Op End Time: " << std::dec
+            << record->_end_timestamp << " ns" << '\n';
+  std::cout << "Memory Op Queue Handle: " << record->_queue_handle << '\n';
+  std::cout << "Memory Op Queue ID: " << record->_sycl_queue_id << '\n';
+  std::cout << "Memory Op CommandList Context Handle: "
+            << record->_context_handle << '\n';
+  std::cout << "Memory Op Id: " << std::dec << record->_mem_op_id << '\n';
+  std::cout << "Memory Bytes Copied: " << std::dec << record->_bytes << '\n';
+  std::cout << "Memory Op Thread Id: " << std::dec << record->_thread_id
+            << '\n';
+  std::cout << "Correlation Id: " << std::dec << record->_correlation_id
+            << '\n';
+  std::cout << "Memory Copy Type: " << std::dec
+            << ptiViewMemcpyTypeToString(record->_memcpy_type) << '\n';
+  std::cout << "Memory Copy Source: " << std::dec
+            << ptiViewMemoryTypeToString(record->_mem_src) << '\n';
+  std::cout << "Memory Copy Destination: " << std::dec
+            << ptiViewMemoryTypeToString(record->_mem_dst) << '\n';
+}
+
+inline void dump_record(pti_view_record_memory_copy_p2p *record) {
+  if (nullptr == record)
+    return;
+
+  std::cout << "Memory Op: " << record->_name << '\n';
+  std::cout << "Memory Source Device: " << record->_src_pci_address << '\n';
+  std::cout << "Memory Destination Device: " << record->_dst_pci_address
+            << '\n';
+  print_uuid(record->_src_uuid, "Memory Source Device UUID: ");
+  print_uuid(record->_dst_uuid, "Memory Destination Device UUID: ");
+  std::cout << "Memory Op Execution Time: " << std::dec
+            << record->_end_timestamp - record->_start_timestamp << " ns"
+            << '\n';
+  std::cout << " Memory Op Append Time: " << std::dec
+            << record->_append_timestamp << " ns" << '\n';
+  std::cout << " Memory Op Submit Time: " << std::dec
+            << record->_submit_timestamp << " ns" << '\n';
+  std::cout << " Memory Op Start Time: " << std::dec
+            << record->_start_timestamp << " ns" << '\n';
+  std::cout << " Memory Op End Time: " << std::dec
+            << record->_end_timestamp << " ns" << '\n';
+  std::cout << "Memory Op Queue Handle: " << record->_queue_handle << '\n';
+  std::cout << "Memory Op Queue ID: " << record->_sycl_queue_id << '\n';
+  std::cout << "Memory Op CommandList Context Handle: "
+            << record->_context_handle << '\n';
+  std::cout << "Memory Op Id: " << std::dec << record->_mem_op_id << '\n';
+  std::cout << "Memory Bytes Copied: " << std::dec << record->_bytes << '\n';
+  std::cout << "Memory Op Thread Id: " << std::dec << record->_thread_id
+            << '\n';
+  std::cout << "Correlation Id: " << std::dec << record->_correlation_id
+            << '\n';
+  std::cout << "Memory Copy Type: " << std::dec
+            << ptiViewMemcpyTypeToString(record->_memcpy_type) << '\n';
+  std::cout << "Memory Copy Source: " << std::dec
+            << ptiViewMemoryTypeToString(record->_mem_src) << '\n';
+  std::cout << "Memory Copy Destination: " << std::dec
+            << ptiViewMemoryTypeToString(record->_mem_dst) << '\n';
+}
+
+inline void dump_record(pti_view_record_memory_fill *record) {
+  if (nullptr == record)
+    return;
+
+  std::cout << "Memory Op: " << record->_name << '\n';
+  std::cout << "Memory Device: " << record->_pci_address << '\n';
+  print_uuid(record->_device_uuid, "Memory Device UUID: ");
+  std::cout << "Memory Op Execution Time: " << std::dec
+            << record->_end_timestamp - record->_start_timestamp << " ns"
+            << '\n';
+  std::cout << " Memory Op Append Time: " << std::dec
+            << record->_append_timestamp << " ns" << '\n';
+  std::cout << " Memory Op Submit Time: " << std::dec
+            << record->_submit_timestamp << " ns" << '\n';
+  std::cout << " Memory Op Start Time: " << std::dec
+            << record->_start_timestamp << " ns" << '\n';
+  std::cout << " Memory Op End Time: " << std::dec
+            << record->_end_timestamp << " ns" << '\n';
+  std::cout << "Memory Op Queue Handle: " << record->_queue_handle << '\n';
+  std::cout << "Memory Op Queue ID: " << record->_sycl_queue_id << '\n';
+  std::cout << "Memory Op CommandList Context Handle: "
+            << record->_context_handle << '\n';
+  std::cout << "Memory Op Id: " << std::dec << record->_mem_op_id << '\n';
+  std::cout << "Memory Op Thread Id: " << std::dec << record->_thread_id
+            << '\n';
+  std::cout << "Memory Bytes Transfered: " << record->_bytes << '\n';
+  std::cout << "Memory Value for Set: " << record->_value_for_set << '\n';
+  std::cout << "Correlation Id: " << std::dec << record->_correlation_id
+            << '\n';
+  std::cout << "Memory Fill Type: " << std::dec << record->_mem_type << '\n';
+}
+
+inline void dump_record(pti_view_record_zecalls *record) {
+  if (nullptr == record)
+    return;
+  const char *pName = nullptr;
+  ptiViewGetCallbackIdName(record->_callback_id, &pName);
+  std::cout << "ZeCall Function Name: " << (pName ? pName : "<unknown>") << '\n';
+  std::cout << "ZeCall Function CBID: " << record->_callback_id << '\n';
+  std::cout << "ZeCall Start Time: " << record->_start_timestamp << '\n';
+  std::cout << " ZeCall End Time: " << record->_end_timestamp << '\n';
+  std::cout << "ZeCall Process Id: " << record->_process_id << '\n';
+  std::cout << "ZeCall Thread Id: " << record->_thread_id << '\n';
+  std::cout << "ZeCall Correlation Id: " << record->_correlation_id << '\n';
+}
+
+inline void dump_record(pti_view_record_oclcalls *record) {
+  if (nullptr == record)
+    return;
+  const char *name = nullptr;
+  ptiViewGetCallbackIdName(record->_callback_id, &name);
+  std::cout << "OclCall Function Name: " << (name ? name : "<unknown>") << '\n';
+  std::cout << "OclCall Function CBID: " << record->_callback_id << '\n';
+  std::cout << "OclCall Start Time: " << record->_start_timestamp << '\n';
+  std::cout << " OclCall End Time: " << record->_end_timestamp << '\n';
+  std::cout << "OclCall Process Id: " << record->_process_id << '\n';
+  std::cout << "OclCall Thread Id: " << record->_thread_id << '\n';
+  std::cout << "OclCall Correlation Id: " << record->_correlation_id << '\n';
+}
+
+inline void dump_record(pti_view_record_sycl_runtime *record) {
+  if (nullptr == record)
+    return;
+  std::cout << "Sycl Function Name: " << record->_name << '\n';
+  std::cout << "Sycl Start Time: " << record->_start_timestamp << '\n';
+  std::cout << "Sycl End Time: " << record->_end_timestamp << '\n';
+  std::cout << "Sycl Process Id: " << record->_process_id << '\n';
+  std::cout << "Sycl Thread Id: " << record->_thread_id << '\n';
+  std::cout << "Sycl Correlation Id: " << record->_correlation_id << '\n';
+}
+
+inline void dump_record(pti_view_record_overhead *record) {
+  if (nullptr == record)
+    return;
+  std::cout << "Overhead Kind : "
+            << ptiViewOverheadKindToString(record->_overhead_kind) << '\n';
+  std::cout << "Overhead Time Duration(ns): " << record->_overhead_duration_ns
+            << '\n';
+  std::cout << "Overhead Count: " << record->_overhead_count << '\n';
+  std::cout << "Overhead ApiId: " << record->_api_id << '\n';
+  std::cout << "Overhead Start Timestamp(ns): "
+            << record->_overhead_start_timestamp_ns << '\n';
+  std::cout << "Overhead End Timestamp(ns): "
+            << record->_overhead_end_timestamp_ns << '\n';
+  std::cout << "Overhead ThreadId: " << record->_overhead_thread_id << '\n';
+  // std::cout << "Overhead API Responsible : "
+  //<< record->_overhead_api_name << '\n';
+}
+
+inline void dump_record(pti_view_record_external_correlation *record) {
+  if (nullptr == record)
+    return;
+  std::cout << "External Correlation Kind : " << record->_external_kind << '\n';
+  std::cout << "Correlation Id: " << record->_correlation_id << '\n';
+  std::cout << "External Id: " << record->_external_id << '\n';
+}
+} // namespace samples_utils
+#endif