Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add kernel time tracing support for gpu #381

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ option(GC_ENABLE_BINDINGS_PYTHON "Enable Graph Complier Python Binding" ON)
option(GC_DEV_LINK_LLVM_DYLIB "Link dynamic libraries of LLVM and MLIR. For developers only. Do not use it in packing the library." OFF)
option(GC_ENABLE_RUNTIME_NAIVE_BRGEMM "Use naive BRGEMM as runtime backend for debug purpose." OFF)
option(GC_BENCH_ENABLE "Build benchgc." ON)
option(GC_ENABLE_GPU_PROFILE "Enable the GPU kernel profiling." OFF)

if(GC_ENABLE_LEGACY)
add_subdirectory(legacy/core)
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,5 @@ Graph Compiler supports the following build-time options.
| GC_DEV_LINK_LLVM_DYLIB | ON, **OFF** | Controls dynamic link LLVM/MLIR libraries, mainly for developer |
| GC_ENABLE_BINDINGS_PYTHON | **ON**, OFF | Controls building the Python API |
| GC_ENABLE_IMEX | ON, **OFF** | Whether to enable the GPU components |
| GC_ENABLE_GPU_PROFILE | ON, **OFF** | Whether to enable the GPU profiling which will profile the kernel execution time |
zhczhong marked this conversation as resolved.
Show resolved Hide resolved

9 changes: 9 additions & 0 deletions cmake/ptigpu.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
include_guard()

FetchContent_Declare(
PTIGPU
GIT_REPOSITORY https://github.com/intel/pti-gpu.git
GIT_TAG exp_opencl_0.11.0
SOURCE_SUBDIR sdk
)
FetchContent_MakeAvailable(PTIGPU)
7 changes: 7 additions & 0 deletions lib/gc/ExecutionEngine/GPURuntime/ocl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,10 @@ gc_add_mlir_library(GcGpuOclRuntime
)
target_include_directories(GcGpuOclRuntime PUBLIC ${OpenCL_INCLUDE_DIRS})
target_link_libraries(GcGpuOclRuntime PUBLIC ${OpenCL_LIBRARIES})

if(GC_ENABLE_GPU_PROFILE)
include(ptigpu)
find_package(Pti REQUIRED)
target_link_libraries(GcGpuOclRuntime PRIVATE Pti::pti_view)
add_definitions(-DGC_ENABLE_GPU_PROFILE)
endif ()
181 changes: 180 additions & 1 deletion lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,185 @@
#include "mlir/Interfaces/DataLayoutInterfaces.h"
#include "mlir/Pass/PassManager.h"

#ifdef GC_ENABLE_GPU_PROFILE
#include "PtiGpuUtils.h"
#include "pti/pti_view.h"
std::map<std::pair<pti_view_external_kind, uint64_t>, std::vector<uint32_t>>
external_corr_map;
std::map<uint32_t, std::string> runtime_enq_2_gpu_kernel_name_map;
std::map<uint32_t, std::string> runtime_enq_2_gpu_mem_op_name_map;

class GPUKernelTracer {
public:
GPUKernelTracer() {
gcLogD("Enable Profiling.");
ptiViewSetCallbacks(
[](auto **buf, auto *buf_size) {
*buf_size = sizeof(pti_view_record_kernel) * 100;
void *ptr = ::operator new(*buf_size);
ptr = std::align(8, sizeof(unsigned char), ptr, *buf_size);
*buf = reinterpret_cast<unsigned char *>(ptr);
if (!*buf) {
std::abort();
}
return;
},
[](auto *buf, auto buf_size, auto valid_buf_size) {
if (!buf_size || !valid_buf_size || !buf_size) {
std::cerr << "Received empty buffer" << '\n';
if (valid_buf_size) {
::operator delete(buf);
}
return;
}
pti_view_record_base *ptr = nullptr;
while (true) {
auto buf_status = ptiViewGetNextRecord(buf, valid_buf_size, &ptr);
if (buf_status == pti_result::PTI_STATUS_END_OF_BUFFER) {
std::cout << "Reached End of buffer" << '\n';
break;
}
if (buf_status != pti_result::PTI_SUCCESS) {
std::cerr << "Found Error Parsing Records from PTI" << '\n';
break;
}
switch (ptr->_view_kind) {
case pti_view_kind::PTI_VIEW_INVALID: {
std::cout << "Found Invalid Record" << '\n';
break;
}
case pti_view_kind::PTI_VIEW_DEVICE_GPU_MEM_COPY: {
std::cout << "---------------------------------------------------"
"-----------------------------"
<< '\n';
pti_view_record_memory_copy *rec =
reinterpret_cast<pti_view_record_memory_copy *>(ptr);
runtime_enq_2_gpu_mem_op_name_map[rec->_correlation_id] =
rec->_name;
std::cout << "Found Memory Record" << '\n';
samples_utils::dump_record(rec);
std::cout << "---------------------------------------------------"
"-----------------------------"
<< '\n';
break;
}
case pti_view_kind::PTI_VIEW_DEVICE_GPU_MEM_FILL: {
std::cout << "---------------------------------------------------"
"-----------------------------"
<< '\n';
pti_view_record_memory_fill *rec =
reinterpret_cast<pti_view_record_memory_fill *>(ptr);
runtime_enq_2_gpu_mem_op_name_map[rec->_correlation_id] =
rec->_name;
std::cout << "Found Memory Record" << '\n';
samples_utils::dump_record(rec);
std::cout << "---------------------------------------------------"
"-----------------------------"
<< '\n';
break;
}
case pti_view_kind::PTI_VIEW_DEVICE_GPU_KERNEL: {
std::cout << "---------------------------------------------------"
"-----------------------------"
<< '\n';
pti_view_record_kernel *rec =
reinterpret_cast<pti_view_record_kernel *>(ptr);
runtime_enq_2_gpu_kernel_name_map[rec->_correlation_id] =
rec->_name;
std::cout << "Found Kernel Record" << '\n';
samples_utils::dump_record(rec);

std::cout << "---------------------------------------------------"
"-----------------------------"
<< '\n';
if (samples_utils::isMonotonic(
{rec->_sycl_task_begin_timestamp,
rec->_sycl_enqk_begin_timestamp, rec->_append_timestamp,
rec->_submit_timestamp, rec->_start_timestamp,
rec->_end_timestamp})) {
std::cout << "------------> All Monotonic" << std::endl;
} else {
std::cerr
<< "------------> Something wrong: NOT All monotonic"
<< std::endl;
};
if (rec->_sycl_task_begin_timestamp == 0) {
std::cerr << "------------> Something wrong: Sycl Task "
"Begin Time is 0"
<< std::endl;
}
if (rec->_sycl_enqk_begin_timestamp == 0) {
std::cerr << "------------> Something wrong: Sycl Enq "
"Launch Kernel Time is 0"
<< std::endl;
}

break;
}
case pti_view_kind::PTI_VIEW_EXTERNAL_CORRELATION: {
std::cout << "---------------------------------------------------"
"-----------------------------"
<< '\n';
pti_view_record_external_correlation *rec =
reinterpret_cast<pti_view_record_external_correlation *>(ptr);

external_corr_map[std::pair{rec->_external_kind,
rec->_external_id}]
.push_back(rec->_correlation_id);
samples_utils::dump_record(rec);
break;
}
case pti_view_kind::PTI_VIEW_OPENCL_CALLS: {
std::cout << "---------------------------------------------------"
"-----------------------------"
<< '\n';
pti_view_record_oclcalls *rec =
reinterpret_cast<pti_view_record_oclcalls *>(ptr);
samples_utils::dump_record(rec);
break;
}
default: {
std::cerr << "This shouldn't happen" << '\n';
break;
}
}
}
::operator delete(buf);
});
ptiViewSetOclProfiling();

ptiViewEnable(PTI_VIEW_DEVICE_GPU_KERNEL);
ptiViewEnable(PTI_VIEW_DEVICE_GPU_MEM_COPY);
ptiViewEnable(PTI_VIEW_DEVICE_GPU_MEM_FILL);
ptiViewEnable(PTI_VIEW_OPENCL_CALLS);
ptiViewEnable(PTI_VIEW_EXTERNAL_CORRELATION);
}

~GPUKernelTracer() {
gcLogD("Profiling is finished.");
ptiViewDisable(PTI_VIEW_DEVICE_GPU_KERNEL);
ptiViewDisable(PTI_VIEW_DEVICE_GPU_MEM_COPY);
ptiViewDisable(PTI_VIEW_DEVICE_GPU_MEM_FILL);
ptiViewEnable(PTI_VIEW_OPENCL_CALLS);
ptiViewDisable(PTI_VIEW_EXTERNAL_CORRELATION);
ptiFlushAllViews();
}
};

/*
Create an RAII tracer with a static life cycle to trace all device kernel
execution during the program. When the tracer's constructor is called, the
EnableProfiling will also be called, registering some metric collection
call-back function into the opencl function call. When the tracer is destroyed,
the DisableProfiling is also called which will statistic the collected metric
during the tracer lifetime and print the result. The concrete implementation of
EnableProfiling and DisableProfiling could refer to
https://github.com/intel/pti-gpu/blob/master/tools/onetrace/tool.cc.
*/
static GPUKernelTracer tracer;

#endif

namespace mlir::gc::gpu {

#define makeClErrPref(code) "OpenCL error ", code, ": "
Expand Down Expand Up @@ -1014,4 +1193,4 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
return cache.emplace(OclDevCtxPair(ext.device, ext.context), ptr)
.first->second;
}
} // namespace mlir::gc::gpu
} // namespace mlir::gc::gpu
Loading
Loading