From 3ed8730f031b18560130658adef4eec349e6837c Mon Sep 17 00:00:00 2001
From: Andrey Pavlenko
Date: Tue, 10 Sep 2024 11:53:47 +0200
Subject: [PATCH] Implemented GPU OpenCL runtime

---
 .github/workflows/clang-tidy.yml              |   2 +-
 CMakeLists.txt                                |   3 +
 cmake/imex.cmake                              |   1 +
 .../GPURuntime/GpuOclRuntime.h                | 455 +++++++++-
 include/gc/Transforms/Passes.h                |  14 +-
 include/gc/Transforms/Passes.td               |   5 +
 include/gc/Utils/Error.h                      |  58 ++
 include/gc/Utils/Log.h                        |  51 ++
 lib/gc/ExecutionEngine/CMakeLists.txt         |   1 +
 .../ExecutionEngine/GPURuntime/CMakeLists.txt |   1 +
 .../GPURuntime/ocl/CMakeLists.txt             |  10 +
 .../GPURuntime/ocl/GpuOclRuntime.cpp          | 838 ++++++++++++++++++
 lib/gc/Transforms/GPU/CMakeLists.txt          |  21 +-
 lib/gc/Transforms/GPU/GpuToGpuOcl.cpp         |  30 +-
 lib/gc/Transforms/GPU/Pipeline.cpp            |  48 +-
 src/gc-opt/CMakeLists.txt                     |   1 -
 .../test/gc/Transforms/GPU/gpu-to-gpuocl.mlir |   2 +-
 test/mlir/test/gc/gpu-runner/lit.local.cfg    |   4 +-
 .../unittests/ExecutionEngine/CMakeLists.txt  |   6 +-
 .../ExecutionEngine/GPU/CMakeLists.txt        |   8 +
 .../ExecutionEngine/GPU/GpuOclRuntimeTest.cpp | 232 +++++
 21 files changed, 1740 insertions(+), 51 deletions(-)
 create mode 100644 include/gc/Utils/Error.h
 create mode 100644 include/gc/Utils/Log.h
 create mode 100644 lib/gc/ExecutionEngine/GPURuntime/CMakeLists.txt
 create mode 100644 lib/gc/ExecutionEngine/GPURuntime/ocl/CMakeLists.txt
 create mode 100644 lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp
 create mode 100644 test/mlir/unittests/ExecutionEngine/GPU/CMakeLists.txt
 create mode 100644 test/mlir/unittests/ExecutionEngine/GPU/GpuOclRuntimeTest.cpp

diff --git a/.github/workflows/clang-tidy.yml b/.github/workflows/clang-tidy.yml
index 59595c00a..52b445901 100644
--- a/.github/workflows/clang-tidy.yml
+++ b/.github/workflows/clang-tidy.yml
@@ -14,7 +14,7 @@ jobs:
 
     steps:
       - name: Install OpenMP
-        run: "sudo apt install -y libomp-dev"
+        run: "sudo apt install -y libomp-dev opencl-c-headers"
 
       - name: Fetch sources
         uses: actions/checkout@v4

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 538076515..66af39632 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -98,12 +98,15 @@ get_property(GC_TOOLS GLOBAL PROPERTY GC_TOOLS)
 get_property(GC_MLIR_LIBS GLOBAL PROPERTY GC_MLIR_LIBS)
 get_property(GC_PASS_LIBS GLOBAL PROPERTY GC_PASS_LIBS)
 get_property(GC_DIALECT_LIBS GLOBAL PROPERTY GC_DIALECT_LIBS)
+get_property(IMEX_LIBS GLOBAL PROPERTY IMEX_LIBS)
+
 install(TARGETS
         GcInterface
         ${GC_TOOLS}
         ${GC_MLIR_LIBS}
         ${GC_PASS_LIBS}
         ${GC_DIALECT_LIBS}
+        ${IMEX_LIBS}
         EXPORT ${PROJECT_NAME}Targets
         ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
         LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}

diff --git a/cmake/imex.cmake b/cmake/imex.cmake
index c698945c4..f36f5c391 100644
--- a/cmake/imex.cmake
+++ b/cmake/imex.cmake
@@ -24,4 +24,5 @@ if (NOT DEFINED IMEX_INCLUDES)
             ${imex_SOURCE_DIR}/src
     )
     set_property(GLOBAL PROPERTY IMEX_INCLUDES ${IMEX_INCLUDES})
+    target_compile_options(GcInterface INTERFACE -DGC_USE_IMEX)
 endif ()

diff --git a/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h b/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h
index b01b9f2c6..c54f0d94e 100644
--- a/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h
+++ b/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h
@@ -10,19 +10,470 @@
 #define GC_GPUOCLRUNTIME_H
 
 namespace mlir::gc::gpu {
-constexpr char GPU_OCL_MALLOC[] = "gcGpuOclMalloc";
+constexpr char GPU_OCL_MALLOC_DEV[] = "gcGpuOclMallocDev";
+constexpr char GPU_OCL_MALLOC_SHARED[] = "gcGpuOclMallocShared";
 constexpr char GPU_OCL_DEALLOC[] = "gcGpuOclDealloc";
 constexpr char GPU_OCL_MEMCPY[] = "gcGpuOclMemcpy";
 constexpr char GPU_OCL_KERNEL_CREATE[] = "gcGpuOclKernelCreate";
 constexpr char GPU_OCL_KERNEL_DESTROY[] = "gcGpuOclKernelDestroy";
 constexpr char GPU_OCL_KERNEL_LAUNCH[] = "gcGpuOclKernelLaunch";
+constexpr char GPU_OCL_FINISH[] = "gcGpuOclFinish";
 constexpr char GPU_OCL_MOD_DESTRUCTOR[] = "gcGpuOclModuleDestructor";
 } // namespace mlir::gc::gpu
 
 #ifndef GC_GPU_OCL_CONST_ONLY
+#include <cstdarg>
+#include <unordered_map>
+#include <unordered_set>
 
-// TBD
+#include <CL/cl_ext.h>
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/ExecutionEngine/ExecutionEngine.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Pass/PassManager.h"
+
+namespace mlir::gc::gpu {
+struct OclDevCtxPair {
+  cl_device_id device;
+  cl_context context;
+  explicit OclDevCtxPair(cl_device_id device, cl_context context)
+      : device(device), context(context) {}
+
+  bool operator==(const OclDevCtxPair &other) const {
+    return device == other.device && context == other.context;
+  }
+};
+} // namespace mlir::gc::gpu
+template <> struct std::hash<mlir::gc::gpu::OclDevCtxPair> {
+  std::size_t
+  operator()(const mlir::gc::gpu::OclDevCtxPair &pair) const noexcept {
+    return std::hash<cl_device_id>()(pair.device) ^
+           std::hash<cl_context>()(pair.context);
+  }
+}; // namespace std
+namespace mlir::gc::gpu {
+struct OclModule;
+struct OclContext;
+struct OclModuleBuilder;
+
+struct OclRuntime {
+  // Returns the available Intel GPU device ids.
+  [[nodiscard]] static llvm::Expected<SmallVector<cl_device_id>>
+  gcIntelDevices(size_t max = std::numeric_limits<size_t>::max());
+
+  [[nodiscard]] static llvm::Expected<OclRuntime> get();
+
+  [[nodiscard]] static llvm::Expected<OclRuntime> get(cl_device_id device);
+
+  [[nodiscard]] static llvm::Expected<OclRuntime> get(cl_command_queue queue);
+
+  [[nodiscard]] static llvm::Expected<OclRuntime> get(cl_device_id device,
+                                                      cl_context context);
+
+  static bool isOutOfOrder(cl_command_queue queue);
+
+  [[nodiscard]] cl_context getContext() const;
+
+  [[nodiscard]] cl_device_id getDevice() const;
+
+  [[nodiscard]] llvm::Expected<cl_command_queue>
+  createQueue(bool outOfOrder = false) const;
+
+  [[nodiscard]] static llvm::Expected<bool>
+  releaseQueue(cl_command_queue queue);
+
+  [[nodiscard]] llvm::Expected<void *> usmAllocDev(size_t size) const;
+
+  [[nodiscard]] llvm::Expected<void *> usmAllocShared(size_t size) const;
+
+  [[nodiscard]] llvm::Expected<bool> usmFree(const void *ptr) const;
+
+  [[nodiscard]] llvm::Expected<bool> usmCpy(OclContext &ctx, const void *src,
+                                            void *dst, size_t size) const;
+
+  template <typename T>
+  [[nodiscard]] llvm::Expected<T *> usmNewDev(size_t size) const {
+    auto expected = usmAllocDev(size * sizeof(T));
+    if (expected) {
+      return static_cast<T *>(*expected);
+    }
+    return expected.takeError();
+  }
+
+  template <typename T>
+  [[nodiscard]] llvm::Expected<T *> usmNewShared(size_t size) const {
+    auto expected = usmAllocShared(size * sizeof(T));
+    if (expected) {
+      return static_cast<T *>(*expected);
+    }
+    return expected.takeError();
+  }
+
+  template <typename T>
+  [[nodiscard]] llvm::Expected<bool> usmCpy(OclContext &ctx, const T *src,
+                                            T *dst, size_t size) const {
+    return usmCpy(ctx, static_cast<const void *>(src),
+                  static_cast<void *>(dst), size * sizeof(T));
+  }
+
+  // Use with caution! This reliably validates USM pointers, but may return a
+  // false positive for any other kind of pointer.
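+  // For example (an illustrative sketch, not part of the API contract; it
+  // assumes an Intel GPU device is present and uses only the methods
+  // declared in this struct plus gcGetOrReport from gc/Utils/Error.h):
+  //
+  //   OclRuntime rt = gcGetOrReport(OclRuntime::get());
+  //   cl_command_queue q = gcGetOrReport(rt.createQueue());
+  //   OclContext ctx{rt, q};
+  //   float *usm = gcGetOrReport(rt.usmNewDev<float>(1024));
+  //   assert(rt.isUsm(usm));  // guaranteed for USM allocations
+  //   // ... enqueue kernels/copies that use 'usm' ...
+  //   gcGetOrReport(ctx.finish());
+  //   gcGetOrReport(rt.usmFree(usm));
+  //   gcGetOrReport(OclRuntime::releaseQueue(q));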
+  bool isUsm(const void *ptr) const;
+
+  bool operator==(const OclRuntime &other) const {
+    return getDevice() == other.getDevice() &&
+           getContext() == other.getContext();
+  }
+
+private:
+  struct Ext;
+  struct Exports;
+  friend OclContext;
+  friend OclModuleBuilder;
+  template <unsigned N> friend struct DynamicExecutor;
+  template <unsigned N> friend struct StaticExecutor;
+  explicit OclRuntime(const Ext &ext);
+  const Ext &ext;
+
+#ifndef NDEBUG
+  static void debug(const char *file, int line, const char *msg);
+#endif
+};
+
+static constexpr int64_t ZERO = 0;
+static constexpr auto ZERO_PTR = const_cast<int64_t *>(&ZERO);
+
+// NOTE: The context is mutable and not thread-safe! It's expected to be used
+// in a single thread only.
+struct OclContext {
+  const OclRuntime &runtime;
+  const cl_command_queue queue;
+  // Preserves the execution order. This is required in case of out-of-order
+  // execution (CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE). When the execution
+  // is completed, the 'lastEvent' field contains the event of the last
+  // enqueued command. If 'preserveOrder' is false, 'waitList' is ignored.
+  const bool preserveOrder;
+  cl_uint waitListLen;
+  cl_event *waitList;
+  cl_event lastEvent;
+
+  explicit OclContext(const OclRuntime &runtime, cl_command_queue queue,
+                      cl_uint waitListLen = 0, cl_event *waitList = nullptr)
+      : OclContext(runtime, queue, OclRuntime::isOutOfOrder(queue),
+                   waitListLen, waitList) {}
+
+  explicit OclContext(const OclRuntime &runtime, cl_command_queue queue,
+                      bool preserveOrder, cl_uint waitListLen,
+                      cl_event *waitList)
+      : runtime(runtime), queue(queue), preserveOrder(preserveOrder),
+        waitListLen(preserveOrder ? waitListLen : 0),
+        waitList(preserveOrder ? waitList : nullptr), lastEvent(nullptr),
+        clPtrs(nullptr) {
+    assert(!OclRuntime::isOutOfOrder(queue) || preserveOrder);
+    assert(preserveOrder || (waitListLen == 0 && waitList == nullptr));
+  }
+
+  OclContext(const OclContext &) = delete;
+  OclContext &operator=(const OclContext &) = delete;
+
+  [[nodiscard]] llvm::Expected<bool> finish();
+
+private:
+  friend OclRuntime;
+  friend OclRuntime::Exports;
+  template <unsigned N> friend struct DynamicExecutor;
+  template <unsigned N> friend struct StaticExecutor;
+  std::unordered_set<void *> *clPtrs;
+
+  void setLastEvent(cl_event event) {
+    lastEvent = event;
+    if (event) {
+      waitListLen = 1;
+      waitList = &lastEvent;
+    } else {
+      waitListLen = 0;
+      waitList = nullptr;
+    }
+  }
+};
+
+struct OclModule {
+  const OclRuntime runtime;
+  // If all the function arguments have static shapes, then this field is true
+  // and main.staticMain is used. Otherwise, main.wrappedMain is used.
+  const bool isStatic;
+
+  ~OclModule();
+  OclModule(const OclModule &) = delete;
+  OclModule &operator=(const OclModule &) = delete;
+  OclModule(const OclModule &&) = delete;
+  OclModule &operator=(const OclModule &&) = delete;
+
+  void dumpToObjectFile(StringRef filename) const {
+    engine->dumpToObjectFile(filename);
+  }
+
+private:
+  friend OclModuleBuilder;
+  template <unsigned N> friend struct DynamicExecutor;
+  template <unsigned N> friend struct OclModuleExecutorBase;
+  template <unsigned N> friend struct StaticExecutor;
+  // This function is only created when all args are memrefs with static shape.
+  using StaticMainFunc = void (*)(OclContext *, void **);
+  // Wrapper, generated by the engine. The arguments are pointers to the
+  // values.
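+  // For example, for 'func @main(%arg0: memref<2x3xf32>)' the packed array
+  // would contain, per the MLIR C-compatible wrapper convention
+  // (an illustrative layout): pointers to allocatedPtr, alignedPtr, offset,
+  // shape[0], shape[1], strides[0], strides[1], followed by the trailing
+  // context arguments appended by the AddContextArg pass.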
+  using WrappedMainFunc = void (*)(void **);
+  union MainFunc {
+    StaticMainFunc staticMain;
+    WrappedMainFunc wrappedMain;
+  };
+  const MainFunc main;
+  const ArrayRef<Type> argTypes;
+  std::unique_ptr<ExecutionEngine> engine;
+
+  explicit OclModule(const OclRuntime &runtime, const bool isStatic,
+                     const MainFunc main, const ArrayRef<Type> argTypes,
+                     std::unique_ptr<ExecutionEngine> engine)
+      : runtime(runtime), isStatic(isStatic), main(main), argTypes(argTypes),
+        engine(std::move(engine)) {}
+};
+
+struct OclModuleBuilderOpts {
+  StringRef funcName = {};
+  bool enableObjectDump = false;
+  ArrayRef<StringRef> sharedLibPaths = {};
+  void (*pipeline)(OpPassManager &) = nullptr;
+};
+
+struct OclModuleBuilder {
+  friend OclRuntime;
+  explicit OclModuleBuilder(ModuleOp module,
+                            const OclModuleBuilderOpts &opts = {});
+  explicit OclModuleBuilder(OwningOpRef<ModuleOp> &module,
+                            const OclModuleBuilderOpts &opts = {})
+      : OclModuleBuilder(module.release(), opts) {}
+  explicit OclModuleBuilder(OwningOpRef<ModuleOp> &&module,
+                            const OclModuleBuilderOpts &opts = {})
+      : OclModuleBuilder(module.release(), opts) {}
+
+  llvm::Expected<std::shared_ptr<const OclModule>>
+  build(const OclRuntime &runtime);
+
+  llvm::Expected<std::shared_ptr<const OclModule>>
+  build(cl_command_queue queue);
+
+  llvm::Expected<std::shared_ptr<const OclModule>>
+  build(cl_device_id device, cl_context context);
+
+private:
+  ModuleOp mlirModule;
+  const bool enableObjectDump;
+  const ArrayRef<StringRef> sharedLibPaths;
+  void (*const pipeline)(OpPassManager &);
+  const StringRef funcName;
+  const ArrayRef<Type> argTypes;
+  std::shared_mutex mux;
+  std::unordered_map<OclDevCtxPair, std::shared_ptr<const OclModule>> cache;
+
+  llvm::Expected<std::shared_ptr<const OclModule>>
+  build(const OclRuntime::Ext &ext);
+};
+
+// NOTE: This class is mutable and not thread-safe!
+template <unsigned N> struct OclModuleExecutorBase {
+
+  void reset() {
+    args.clear();
+    clPtrs.clear();
+    argCounter = 0;
+  }
+
+  Type getArgType(unsigned idx) const { return mod->argTypes[idx]; }
+
+  [[nodiscard]] bool isSmall() const { return args.small(); }
+
+protected:
+  struct Args : SmallVector<void *, N> {
+    [[nodiscard]] bool small() const { return this->isSmall(); }
+  };
+
+  const std::shared_ptr<const OclModule> &mod;
+  // Contains the pointers of all non-USM arguments. It's expected that the
+  // arguments are either USM or CL pointers and, most probably, are USM;
+  // thus, in most cases, this set will be empty.
+  std::unordered_set<void *> clPtrs;
+  Args args;
+  unsigned argCounter = 0;
+
+  explicit OclModuleExecutorBase(std::shared_ptr<const OclModule> &mod)
+      : mod(mod) {}
+
+#ifndef NDEBUG
+  void checkExec(const OclContext &ctx) const {
+    auto rt = OclRuntime::get(ctx.queue);
+    assert(rt);
+    assert(*rt == mod->runtime);
+    assert(argCounter == mod->argTypes.size());
+  }
+
+  void checkArg(const void *alignedPtr, bool isUsm = true) const {
+    assert(!isUsm || mod->runtime.isUsm(alignedPtr));
+  }
+#endif
+};
+
+// NOTE: This executor can only be used if mod->isStatic == true!
+template <unsigned N> struct StaticExecutor : OclModuleExecutorBase<N> {
+  explicit StaticExecutor(std::shared_ptr<const OclModule> &mod)
+      : OclModuleExecutorBase<N>(mod) {
+    assert(this->mod->isStatic);
+  }
+
+  void exec(OclContext &ctx) {
+#ifndef NDEBUG
+    this->checkExec(ctx);
+#endif
+    ctx.clPtrs = &this->clPtrs;
+    this->mod->main.staticMain(&ctx, this->args.data());
+  }
+
+  void arg(void *alignedPtr, bool isUsm = true) {
+#ifndef NDEBUG
+    this->checkArg(alignedPtr, isUsm);
+    std::ostringstream oss;
+    oss << "Arg" << this->argCounter << ": alignedPtr=" << alignedPtr
+        << ", isUsm=" << (isUsm ? "true" : "false");
+    OclRuntime::debug(__FILE__, __LINE__, oss.str().c_str());
+#endif
+    ++this->argCounter;
+    this->args.emplace_back(alignedPtr);
+    if (!isUsm) {
+      this->clPtrs.insert(alignedPtr);
+    }
+  }
+
+  template <typename T> void arg(T *alignedPtr, bool isUsm = true) {
+    arg(reinterpret_cast<void *>(alignedPtr), isUsm);
+  }
+
+  void operator()(OclContext &ctx) { exec(ctx); }
+
+  template <typename T> void operator()(OclContext &ctx, T *ptr1, ...) {
+    {
+      this->reset();
+      arg(reinterpret_cast<void *>(ptr1));
+      va_list args;
+      va_start(args, ptr1);
+      for (unsigned i = 0, n = this->mod->argTypes.size() - 1; i < n; i++) {
+        arg(va_arg(args, void *));
+      }
+      va_end(args);
+      exec(ctx);
+    }
+  }
+};
+
+// The main function arguments are added in the following format -
+// https://mlir.llvm.org/docs/TargetLLVMIR/#c-compatible-wrapper-emission.
+// NOTE: This executor can only be used if mod->isStatic != true!
+template <unsigned N> struct DynamicExecutor : OclModuleExecutorBase<N> {
+  explicit DynamicExecutor(std::shared_ptr<const OclModule> &mod)
+      : OclModuleExecutorBase<N>(mod) {
+    assert(!this->mod->isStatic);
+  }
+
+  void exec(OclContext &ctx) {
+#ifndef NDEBUG
+    this->checkExec(ctx);
+#endif
+    auto size = this->args.size();
+    auto ctxPtr = &ctx;
+    this->args.emplace_back(&ctxPtr);
+    this->args.emplace_back(&ctxPtr);
+    this->args.emplace_back(ZERO_PTR);
+    this->mod->main.wrappedMain(this->args.data());
+    this->args.truncate(size);
+  }
+
+  void arg(void *&alignedPtr, size_t rank, const int64_t *shape,
+           const int64_t *strides, bool isUsm = true) {
+    arg(alignedPtr, alignedPtr, ZERO, rank, shape, strides, isUsm);
+  }
+
+  // NOTE: The argument values are not copied, only the pointers are stored!
+  void arg(void *&allocatedPtr, void *&alignedPtr, const int64_t &offset,
+           size_t rank, const int64_t *shape, const int64_t *strides,
+           bool isUsm = true) {
+#ifndef NDEBUG
+    this->checkArg(alignedPtr, isUsm);
+    if (auto type = llvm::dyn_cast<MemRefType>(
+            this->getArgType(this->argCounter))) {
+      if (type.hasStaticShape()) {
+        auto size = type.getShape();
+        assert(rank == size.size());
+        for (size_t i = 0; i < rank; i++) {
+          assert(shape[i] == size[i]);
+        }
+
+        SmallVector<int64_t> expectedStrides;
+        if (int64_t expectedOffset; !failed(
+                getStridesAndOffset(type, expectedStrides, expectedOffset))) {
+          assert(expectedOffset == offset);
+          for (size_t i = 0; i < rank; i++) {
+            assert(expectedStrides[i] == strides[i]);
+          }
+        }
+      }
+    }
+
+    std::ostringstream oss;
+    oss << "Arg" << this->argCounter << ": ptr=" << allocatedPtr
+        << ", alignedPtr=" << alignedPtr
+        << ", isUsm=" << (isUsm ? "true" : "false") << ", offset=" << offset
+        << ", shape=[";
+    for (unsigned i = 0; i < rank; i++) {
+      oss << shape[i] << (i + 1 < rank ? ", " : "]");
+    }
+    oss << ", strides=[";
+    for (unsigned i = 0; i < rank; i++) {
+      oss << strides[i] << (i + 1 < rank ? ", " : "]");
+    }
+    OclRuntime::debug(__FILE__, __LINE__, oss.str().c_str());
+#endif
+
+    ++this->argCounter;
+    this->args.emplace_back(&allocatedPtr);
+    this->args.emplace_back(&alignedPtr);
+    this->args.emplace_back(const_cast<int64_t *>(&offset));
+    for (size_t i = 0; i < rank; i++) {
+      this->args.emplace_back(const_cast<int64_t *>(&shape[i]));
+    }
+    for (size_t i = 0; i < rank; i++) {
+      this->args.emplace_back(const_cast<int64_t *>(&strides[i]));
+    }
+    if (!isUsm) {
+      this->clPtrs.insert(alignedPtr);
+    }
+  }
+
+  template <typename T>
+  void arg(T *&alignedPtr, size_t rank, const int64_t *shape,
+           const int64_t *strides, bool isUsm = true) {
+    arg(reinterpret_cast<void *&>(alignedPtr), rank, shape, strides, isUsm);
+  }
+
+  template <typename T>
+  void arg(T *&allocatedPtr, T *&alignedPtr, const int64_t &offset,
+           size_t rank, const int64_t *shape, const int64_t *strides,
+           bool isUsm = true) {
+    arg(reinterpret_cast<void *&>(allocatedPtr),
+        reinterpret_cast<void *&>(alignedPtr), offset, rank, shape, strides,
+        isUsm);
+  }
+
+  void operator()(OclContext &ctx) { exec(ctx); }
+};
+} // namespace mlir::gc::gpu
 #else
 #undef GC_GPU_OCL_CONST_ONLY
 #endif

diff --git a/include/gc/Transforms/Passes.h b/include/gc/Transforms/Passes.h
index 558bd9b16..c468f22b3 100644
--- a/include/gc/Transforms/Passes.h
+++ b/include/gc/Transforms/Passes.h
@@ -10,6 +10,7 @@
 #define GC_PASSES_H
 
 #include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassOptions.h"
 
 namespace mlir {
 class OpBuilder;
@@ -115,7 +116,18 @@ void populateFrontendPasses(mlir::OpPassManager &);
 void populateCPUPipeline(mlir::OpPassManager &);
 
 #ifdef GC_USE_IMEX
-struct GPUPipelineOption;
+struct GPUPipelineOption : PassPipelineOptions<GPUPipelineOption> {
+  Option<bool> isUsmArgs{
+      *this, "is-usm-args",
+      llvm::cl::desc("Whether to use USM (unified shared memory) func args, "
+                     "in which case the host and device can access the same "
+                     "buffer and there is no need to add memcpy explicitly."),
+      llvm::cl::init(true)};
+  Option<bool> callFinish{
+      *this, "call-finish",
+      llvm::cl::desc("Call finish() after each GPU kernel launch."),
+      llvm::cl::init(false)};
+};
 void populateGPUPipeline(mlir::OpPassManager &, const GPUPipelineOption &);
 #endif

diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td
index 2ddf0a06e..fb5581bb5 100644
--- a/include/gc/Transforms/Passes.td
+++ b/include/gc/Transforms/Passes.td
@@ -106,6 +106,11 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
   let description = [{
     Convert the gpu alloc, dealloc, memcpy and launch operations to GpuOclRuntime calls.
   }];
+  let options = [
+    Option<"callFinish", "call-finish", "bool",
+           /*default=*/"false",
+           "Call finish() after each kernel launch.">
+  ];
 }
 
 #endif // GC_USE_IMEX

diff --git a/include/gc/Utils/Error.h b/include/gc/Utils/Error.h
new file mode 100644
index 000000000..6694e14fe
--- /dev/null
+++ b/include/gc/Utils/Error.h
@@ -0,0 +1,58 @@
+//===-- Error.h - Error processing functions --------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef GC_ERROR_H
+#define GC_ERROR_H
+
+#include <sstream>
+
+#include "gc/Utils/Log.h"
+
+#include "llvm/Support/Error.h"
+
+namespace mlir::gc::err {
+#ifdef NDEBUG
+#define GC_ERR_LOC_DECL
+#define GC_ERR_LOC_ARGS
+#define GC_ERR_LOC
+#else
+#define GC_ERR_LOC_DECL const char *fileName, int lineNum,
+#define GC_ERR_LOC_ARGS fileName, lineNum,
+#define GC_ERR_LOC __FILE__, __LINE__,
+#endif
+
+#define gcMakeErr(...) mlir::gc::err::makeLlvmError(GC_ERR_LOC __VA_ARGS__)
+#define gcReportErr(...)                                                       \
+  mlir::gc::err::report(GC_ERR_LOC gcMakeErr(__VA_ARGS__))
+#define gcGetOrReport(expected) mlir::gc::err::getOrReport(GC_ERR_LOC expected)
+
+template <typename... Args>
+[[nodiscard]] llvm::Error makeLlvmError(GC_ERR_LOC_DECL Args... args) {
+  log::insetLog(GC_ERR_LOC_ARGS std::cerr, "ERROR", args...);
+  std::ostringstream oss;
+  log::insertArgs(oss, args...);
+  auto msg = oss.str();
+  return llvm::make_error<llvm::StringError>(msg.substr(0, msg.length() - 1),
+                                             llvm::inconvertibleErrorCode());
+}
+
+[[noreturn]] static void report(GC_ERR_LOC_DECL llvm::Error err) {
+  log::insetLog(GC_ERR_LOC_ARGS std::cerr, "ERROR", "Unrecoverable error!");
+  report_fatal_error(std::move(err));
+}
+
+template <typename T>
+T getOrReport(GC_ERR_LOC_DECL llvm::Expected<T> expected) {
+  if (expected) {
+    return *expected;
+  }
+  report(GC_ERR_LOC_ARGS expected.takeError());
+}
+} // namespace mlir::gc::err
+
+#endif

diff --git a/include/gc/Utils/Log.h b/include/gc/Utils/Log.h
new file mode 100644
index 000000000..dc7bfc6f8
--- /dev/null
+++ b/include/gc/Utils/Log.h
@@ -0,0 +1,51 @@
+//===-- Log.h - Logging functions -------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef GC_LOG_H
+#define GC_LOG_H
+
+#include <iostream>
+
+namespace mlir::gc::log {
+static void insertArgs(std::ostream &stream) { stream << std::endl; }
+
+template <typename T, typename... Args>
+static void insertArgs(std::ostream &stream, T first, Args... args) {
+  stream << first;
+  insertArgs(stream, args...);
+}
+
+template <typename... Args>
+static void insetLog(
+#ifndef NDEBUG
+    const char *fileName, int lineNum,
+#endif
+    std::ostream &stream, const char *pref, Args... args) {
+  stream << "[" << pref << "] ";
+#ifndef NDEBUG
+  stream << "[" << fileName << ":" << lineNum << "] ";
+#endif
+  insertArgs(stream, args...);
+}
+
+#ifdef NDEBUG
+#define gcLogD(...)
+#define gcLogE(...) mlir::gc::log::insetLog(std::cerr, "ERROR", __VA_ARGS__)
+#else
+#define _insetLog(stream, pref, ...)                                           \
+  mlir::gc::log::insetLog(__FILE__, __LINE__, stream, pref, __VA_ARGS__)
+#define gcLogD(...) _insetLog(std::cout, "DEBUG", __VA_ARGS__)
+#define gcLogE(...) _insetLog(std::cerr, "ERROR", __VA_ARGS__)
+#endif
+} // namespace mlir::gc::log
+
+#ifdef GC_LOG_NO_DEBUG
+#undef gcLogD
+#define gcLogD(...)
+#endif
+
+#endif

diff --git a/lib/gc/ExecutionEngine/CMakeLists.txt b/lib/gc/ExecutionEngine/CMakeLists.txt
index f13a27b1a..0985f1536 100644
--- a/lib/gc/ExecutionEngine/CMakeLists.txt
+++ b/lib/gc/ExecutionEngine/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_subdirectory(CPURuntime)
 add_subdirectory(Driver)
 if(GC_ENABLE_IMEX)
+  add_subdirectory(GPURuntime)
   add_subdirectory(OpenCLRuntime)
 endif()
\ No newline at end of file

diff --git a/lib/gc/ExecutionEngine/GPURuntime/CMakeLists.txt b/lib/gc/ExecutionEngine/GPURuntime/CMakeLists.txt
new file mode 100644
index 000000000..8aeab1db4
--- /dev/null
+++ b/lib/gc/ExecutionEngine/GPURuntime/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(ocl)

diff --git a/lib/gc/ExecutionEngine/GPURuntime/ocl/CMakeLists.txt b/lib/gc/ExecutionEngine/GPURuntime/ocl/CMakeLists.txt
new file mode 100644
index 000000000..37cd56b85
--- /dev/null
+++ b/lib/gc/ExecutionEngine/GPURuntime/ocl/CMakeLists.txt
@@ -0,0 +1,10 @@
+find_package(OpenCL REQUIRED)
+
+gc_add_mlir_library(GcGpuOclRuntime
+  GpuOclRuntime.cpp
+  LINK_LIBS PUBLIC
+  GcInterface
+  GcGpuPasses
+)
+target_include_directories(GcGpuOclRuntime PUBLIC ${OpenCL_INCLUDE_DIRS})
+target_link_libraries(GcGpuOclRuntime PUBLIC ${OpenCL_LIBRARIES})

diff --git a/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp b/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp
new file mode 100644
index 000000000..90a6b5cf5
--- /dev/null
+++ b/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp
@@ -0,0 +1,838 @@
+//===-- GpuOclRuntime.cpp - GPU OpenCL Runtime ------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <shared_mutex>
+
+// Comment out the following line to enable debug logging
+// #define GC_LOG_NO_DEBUG
+#include "gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h"
+#include "gc/Transforms/Passes.h"
+#include "gc/Utils/Error.h"
+#include "gc/Utils/Log.h"
+
+#include "llvm/ExecutionEngine/Orc/LLJIT.h"
+#include "llvm/Support/Error.h"
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Pass/PassManager.h"
+
+namespace mlir::gc::gpu {
+
+#define makeClErrPref(code) "OpenCL error ", code, ": "
+#define makeClErr(code, ...) gcMakeErr(makeClErrPref(code), __VA_ARGS__)
+#define reportClErr(code, ...) gcReportErr(makeClErrPref(code), __VA_ARGS__)
+
+#define CHECK(cond, ...)                                                       \
+  do {                                                                         \
+    if (!(cond))                                                               \
+      return gcMakeErr(__VA_ARGS__);                                           \
+  } while (0)
+#define CHECKE(expected, ...)                                                  \
+  do {                                                                         \
+    if (!expected) {                                                           \
+      gcLogE(__VA_ARGS__);                                                     \
+      return expected.takeError();                                             \
+    }                                                                          \
+  } while (0)
+#define CL_CHECK(expr, ...)                                                    \
+  do {                                                                         \
+    if (auto _cl_check_err = (expr); _cl_check_err != CL_SUCCESS)              \
+      return makeClErr(_cl_check_err, __VA_ARGS__);                            \
+  } while (0)
+#define CL_CHECKR(expr, ...)                                                   \
+  do {                                                                         \
+    if (auto _cl_check_err = (expr); _cl_check_err != CL_SUCCESS) {            \
+      reportClErr(_cl_check_err, __VA_ARGS__);                                 \
+    }                                                                          \
+  } while (0)
+
+// cl_ext function pointers
+struct OclRuntime::Ext : OclDevCtxPair {
+  clDeviceMemAllocINTEL_fn clDeviceMemAllocINTEL;
+  clSharedMemAllocINTEL_fn clSharedMemAllocINTEL;
+  clMemFreeINTEL_fn clMemFreeINTEL;
+  clEnqueueMemcpyINTEL_fn clEnqueueMemcpyINTEL;
+  clGetMemAllocInfoINTEL_fn clGetMemAllocInfoINTEL;
+  clSetKernelArgMemPointerINTEL_fn clSetKernelArgMemPointerINTEL;
+
+  explicit Ext(cl_device_id device, cl_context context,
+               clDeviceMemAllocINTEL_fn clDeviceMemAllocINTEL,
+               clSharedMemAllocINTEL_fn clSharedMemAllocINTEL,
+               clMemFreeINTEL_fn clMemFreeINTEL,
+               clEnqueueMemcpyINTEL_fn clEnqueueMemcpyINTEL,
+               clGetMemAllocInfoINTEL_fn clGetMemAllocInfoINTEL,
+               clSetKernelArgMemPointerINTEL_fn clSetKernelArgMemPointerINTEL)
+      : OclDevCtxPair(device, context),
+        clDeviceMemAllocINTEL(clDeviceMemAllocINTEL),
+        clSharedMemAllocINTEL(clSharedMemAllocINTEL),
+        clMemFreeINTEL(clMemFreeINTEL),
+        clEnqueueMemcpyINTEL(clEnqueueMemcpyINTEL),
+        clGetMemAllocInfoINTEL(clGetMemAllocInfoINTEL),
+        clSetKernelArgMemPointerINTEL(clSetKernelArgMemPointerINTEL) {}
+
+  static llvm::Expected<const Ext *> get(cl_device_id device,
+                                         cl_context context) {
+    static std::shared_mutex mux;
+    static std::unordered_map<OclDevCtxPair, const Ext *> cache;
+
+    OclDevCtxPair pair{device, context};
+    {
+      std::shared_lock lock(mux);
+      if (auto it = cache.find(pair); it != cache.end()) {
+        return it->second;
+      }
+    }
+
+    cl_platform_id platform;
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_PLATFORM,
+                             sizeof(cl_platform_id), &platform, nullptr),
+             "Failed to get the device platform.");
+
+#define FIND_FUNC(name)                                                        \
+  auto name = reinterpret_cast<name##_fn>(                                     \
+      clGetExtensionFunctionAddressForPlatform(platform, #name));              \
+  CHECK(name, "Failed to get the " #name " function address.")
+
+    FIND_FUNC(clDeviceMemAllocINTEL);
+    FIND_FUNC(clSharedMemAllocINTEL);
+    FIND_FUNC(clMemFreeINTEL);
+    FIND_FUNC(clEnqueueMemcpyINTEL);
+    FIND_FUNC(clGetMemAllocInfoINTEL);
+    FIND_FUNC(clSetKernelArgMemPointerINTEL);
+
+    std::lock_guard lock(mux);
+    if (auto it = cache.find(pair); it != cache.end()) {
+      return it->second;
+    }
+    auto ext =
+        new Ext(device, context, clDeviceMemAllocINTEL, clSharedMemAllocINTEL,
+                clMemFreeINTEL, clEnqueueMemcpyINTEL, clGetMemAllocInfoINTEL,
+                clSetKernelArgMemPointerINTEL);
+    cache.emplace(pair, ext);
+    return ext;
+  }
+};
+
+struct Kernel {
+  cl_program program;
+  cl_kernel kernel;
+  const size_t globalSize[3];
+  const size_t localSize[3];
+  const SmallVector<size_t> argSize;
+
+  explicit Kernel(cl_program program, cl_kernel kernel, const size_t *gridSize,
+                  const size_t *blockSize, size_t argNum,
+                  const size_t *argSize)
+      : program(program),
+        kernel(kernel), globalSize{gridSize[0] * blockSize[0],
+                                   gridSize[1] * blockSize[1],
+                                   gridSize[2] * blockSize[2]},
+        localSize{blockSize[0], blockSize[1], blockSize[2]},
+        argSize(argSize, argSize + argNum) {
+#ifndef NDEBUG
+    std::string args;
+    for (size_t i = 0; i < argNum; i++) {
+      args += std::to_string(argSize[i]);
+      if (i < argNum - 1) {
+        args += ", ";
+      }
+    }
+    gcLogD("Kernel ", kernel, " params: globalSize=[", globalSize[0], ", ",
+           globalSize[1], ", ", globalSize[2], "], localSize=[", localSize[0],
+           ", ", localSize[1], ", ", localSize[2], "], argSize=[",
+           args.c_str(), "]");
+#endif
+  }
+
+  ~Kernel() {
+    CL_CHECKR(clReleaseKernel(kernel), "Failed to release OpenCL kernel.");
+    gcLogD("Released OpenCL kernel: ", kernel);
+    CL_CHECKR(clReleaseProgram(program), "Failed to release OpenCL program.");
+    gcLogD("Released OpenCL program: ", program);
+  }
+};
+
+// Functions exported to the ExecutionEngine
+struct OclRuntime::Exports {
+  static llvm::orc::SymbolMap
+  symbolMap(llvm::orc::MangleAndInterner interner) {
+    llvm::orc::SymbolMap map;
+    map.reserve(8);
+    map.try_emplace(interner(GPU_OCL_MALLOC_DEV),
+                    llvm::orc::ExecutorAddr::fromPtr(&allocDev),
+                    llvm::JITSymbolFlags::Exported);
+    map.try_emplace(interner(GPU_OCL_MALLOC_SHARED),
+                    llvm::orc::ExecutorAddr::fromPtr(&allocShared),
+                    llvm::JITSymbolFlags::Exported);
+    map.try_emplace(interner(GPU_OCL_DEALLOC),
+                    llvm::orc::ExecutorAddr::fromPtr(&dealloc),
+                    llvm::JITSymbolFlags::Exported);
+    map.try_emplace(interner(GPU_OCL_MEMCPY),
+                    llvm::orc::ExecutorAddr::fromPtr(&memcpy),
+                    llvm::JITSymbolFlags::Exported);
+    map.try_emplace(interner(GPU_OCL_KERNEL_CREATE),
+                    llvm::orc::ExecutorAddr::fromPtr(&kernelCreate),
+                    llvm::JITSymbolFlags::Exported);
+    map.try_emplace(interner(GPU_OCL_KERNEL_DESTROY),
+                    llvm::orc::ExecutorAddr::fromPtr(&kernelDestroy),
+                    llvm::JITSymbolFlags::Exported);
+    map.try_emplace(interner(GPU_OCL_KERNEL_LAUNCH),
+                    llvm::orc::ExecutorAddr::fromPtr(&kernelLaunch),
+                    llvm::JITSymbolFlags::Exported);
+    map.try_emplace(interner(GPU_OCL_FINISH),
+                    llvm::orc::ExecutorAddr::fromPtr(&finish),
+                    llvm::JITSymbolFlags::Exported);
+    return map;
+  }
+
+private:
+  static void *allocDev(const OclContext *ctx, size_t size) {
+    return gcGetOrReport(ctx->runtime.usmAllocDev(size));
+  }
+
+  static void *allocShared(const OclContext *ctx, size_t size) {
+    return gcGetOrReport(ctx->runtime.usmAllocShared(size));
+  }
+
+  static void dealloc(const OclContext *ctx, const void *ptr) {
+    gcGetOrReport(ctx->runtime.usmFree(ptr));
+  }
+
+  static void memcpy(OclContext *ctx, const void *src, void *dst,
+                     size_t size) {
+    gcGetOrReport(ctx->runtime.usmCpy(*ctx, src, dst, size));
+  }
+
+  static Kernel *kernelCreate(const OclContext *ctx, size_t spirvLen,
+                              const unsigned char *spirv, const char *name,
+                              const size_t *gridSize, const size_t *blockSize,
+                              size_t argNum, const size_t *argSize) {
+    cl_int err;
+    auto program =
+        clCreateProgramWithIL(ctx->runtime.ext.context, spirv, spirvLen, &err);
+    CL_CHECKR(err, "Failed to create OpenCL program with IL.");
+
+    gcLogD("Created new OpenCL program: ", program);
+    err = clBuildProgram(program, 1, &ctx->runtime.ext.device, nullptr,
+                         nullptr, nullptr);
+    CL_CHECKR(err, "Failed to build the program: ", program);
+    gcLogD("The program has been built: ", program);
+
+    auto kernel = clCreateKernel(program, name, &err);
+    CL_CHECKR(err, "Failed to create OpenCL kernel from program: ", program);
+    gcLogD("Created new OpenCL kernel ", kernel, " from program ", program);
+
+    cl_bool enable = CL_TRUE;
+    err = clSetKernelExecInfo(kernel,
+                              CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL,
+                              sizeof(enable), &enable);
+    CL_CHECKR(err, "Failed to set indirect host access.");
+    err = clSetKernelExecInfo(kernel,
+                              CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL,
+                              sizeof(enable), &enable);
+    CL_CHECKR(err, "Failed to set indirect device access.");
+    err = clSetKernelExecInfo(kernel,
+                              CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL,
+                              sizeof(enable), &enable);
+    CL_CHECKR(err, "Failed to set indirect shared access.");
+
+    return new Kernel(program, kernel, gridSize, blockSize, argNum, argSize);
+  }
+
+  static void kernelDestroy(size_t count, Kernel **kernels) {
+    gcLogD("Destroying kernels.");
+    for (size_t i = 0; i < count; i++) {
+      if (kernels[i]) {
+        delete kernels[i];
+      }
+    }
+  }
+
+  static void kernelLaunch(OclContext *ctx, Kernel *kernel, ...) {
+    struct ClonedKernel {
+      cl_kernel kernel;
+
+      explicit ClonedKernel(cl_kernel kernel) : kernel(kernel) {}
+
+      ~ClonedKernel() {
+        gcLogD("Releasing cloned OpenCL kernel: ", kernel);
+        CL_CHECKR(clReleaseKernel(kernel),
+                  "Failed to release the kernel: ", kernel);
+      }
+    };
+
+    gcLogD("Launching kernel: ", kernel->kernel);
+
+    cl_int err;
+    ClonedKernel cloned{clCloneKernel(kernel->kernel, &err)};
+    CL_CHECKR(err, "Failed to clone OpenCL kernel: ", kernel->kernel);
+    gcLogD("Cloned OpenCL kernel ", kernel->kernel, ": ", cloned.kernel);
+
+    va_list args;
+    va_start(args, kernel);
+    for (size_t i = 0, n = kernel->argSize.size(); i < n; i++) {
+      auto size = kernel->argSize[i];
+      void *ptr = va_arg(args, void *);
+
+      if (size) {
+        gcLogD("Setting kernel ", cloned.kernel, " argument ", i, " to ",
+               *static_cast<size_t *>(ptr));
+        err = clSetKernelArg(cloned.kernel, i, size, ptr);
+      } else if (ctx->clPtrs->find(ptr) == ctx->clPtrs->end()) {
+        gcLogD("Setting kernel ", cloned.kernel, " argument ", i,
+               " to USM pointer ", ptr);
+        err = ctx->runtime.ext.clSetKernelArgMemPointerINTEL(cloned.kernel, i,
+                                                             ptr);
+      } else {
+        gcLogD("Setting kernel ", cloned.kernel, " argument ", i,
+               " to CL pointer ", ptr);
+        err = clSetKernelArg(cloned.kernel, i, sizeof(cl_mem), &ptr);
+      }
+
+      CL_CHECKR(err, "Failed to set kernel ", cloned.kernel, " argument ", i,
+                " of size ", size);
+    }
+    va_end(args);
+
+    if (ctx->preserveOrder) {
+      cl_event event = nullptr;
+      err = clEnqueueNDRangeKernel(ctx->queue, cloned.kernel, 3, nullptr,
+                                   kernel->globalSize, kernel->localSize,
+                                   ctx->waitListLen, ctx->waitList, &event);
+      ctx->setLastEvent(event);
+    } else {
+      err = clEnqueueNDRangeKernel(ctx->queue, cloned.kernel, 3, nullptr,
+                                   kernel->globalSize, kernel->localSize, 0,
+                                   nullptr, nullptr);
+    }
+
+    CL_CHECKR(err, "Failed to enqueue kernel execution: ", cloned.kernel);
+    gcLogD("Enqueued kernel execution: ", cloned.kernel);
+  }
+
+  static void finish(OclContext *ctx) { gcGetOrReport(ctx->finish()); }
+};
+
+OclRuntime::OclRuntime(const Ext &ext) : ext(ext) {}
+
+llvm::Expected<SmallVector<cl_device_id>>
+OclRuntime::gcIntelDevices(size_t max) {
+  SmallVector<cl_device_id> intelDevices;
+  if (max == 0) {
+    return intelDevices;
+  }
+
+  cl_uint numPlatforms;
+  CL_CHECK(clGetPlatformIDs(0, nullptr, &numPlatforms),
+           "Failed to get the number of platforms.");
+
+  if (numPlatforms == 0) {
+    gcLogD("No platforms found.");
+    return intelDevices;
+  }
+
+  SmallVector<cl_platform_id> platforms(numPlatforms);
+  auto err = clGetPlatformIDs(numPlatforms, platforms.data(), nullptr);
+  if (err != CL_SUCCESS) {
+    gcLogE("Failed to get the platform ids. Error: ", err);
+    return intelDevices;
+  }
+
+  for (auto platform : platforms) {
+    cl_uint numDevices;
+    err =
+        clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, nullptr, &numDevices);
+    if (err != CL_SUCCESS) {
+      gcLogE("Failed to get the number of devices on the platform ", platform,
+             ". Error: ", err);
+      continue;
+    }
+    if (numDevices == 0) {
+      continue;
+    }
+
+    SmallVector<cl_device_id> devices(numDevices);
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices,
+                         devices.data(), nullptr);
+    if (err != CL_SUCCESS) {
+      gcLogE("Failed to get the device ids on the platform ", platform,
+             ". Error: ", err);
+      continue;
+    }
+
+    for (auto dev : devices) {
+      cl_uint vendorId;
+      err = clGetDeviceInfo(dev, CL_DEVICE_VENDOR_ID, sizeof(cl_uint),
+                            &vendorId, nullptr);
+      if (err != CL_SUCCESS) {
+        gcLogE("Failed to get info about the device ", dev, ". Error: ", err);
+        continue;
+      }
+      if (vendorId == 0x8086) {
+        intelDevices.emplace_back(dev);
+#ifndef NDEBUG
+        size_t nameSize;
+        std::string name;
+        clGetDeviceInfo(dev, CL_DEVICE_NAME, 0, nullptr, &nameSize);
+        name.resize(nameSize);
+        clGetDeviceInfo(dev, CL_DEVICE_NAME, nameSize, &name[0], nullptr);
+        gcLogD("[ INFO ] GPU device ", name.c_str(), " id: ", dev);
+#endif
+        if (intelDevices.size() == max) {
+          return intelDevices;
+        }
+      }
+    }
+  }
+
+  return intelDevices;
+}
+
+llvm::Expected<OclRuntime> OclRuntime::get() {
+  static OclRuntime *defaultRuntimePtr = nullptr;
+  if (OclRuntime *rt = defaultRuntimePtr) {
+    return *rt;
+  }
+
+  auto devices = gcIntelDevices(1);
+  CHECKE(devices, "Failed to get Intel GPU devices.");
+  if (devices->empty()) {
+    return gcMakeErr("No Intel GPU devices found.");
+  }
+
+  auto rt = get(devices.get()[0]);
+  CHECKE(rt, "Failed to create OclRuntime.");
+
+  static OclRuntime defaultRuntime = rt.get();
+  defaultRuntimePtr = &defaultRuntime;
+  return defaultRuntime;
+}
+
+llvm::Expected<OclRuntime> OclRuntime::get(cl_device_id device) {
+  static std::shared_mutex mux;
+  static std::unordered_map<cl_device_id, cl_context> cache;
+  cl_context context = nullptr;
+
+  {
+    std::shared_lock lock(mux);
+    if (auto it = cache.find(device); it != cache.end()) {
+      context = it->second;
+    }
+  }
+
+  if (context) {
+    return get(device, context);
+  }
+
+  cl_int err;
+  context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
+  CL_CHECK(err, "Failed to create OpenCL context.");
+  gcLogD("Created new OpenCL context: ", context);
+
+  {
+    std::lock_guard lock(mux);
+    if (auto it = cache.find(device); it != cache.end()) {
+      if (clReleaseContext(context) != CL_SUCCESS) {
+        gcLogE("Failed to release OpenCL context: ", context);
+      } else {
+        gcLogD("Released OpenCL context: ", context);
+      }
+      context = it->second;
+    } else {
+      cache.emplace(device, context);
+    }
+  }
+
+  return get(device, context);
+}
+
+llvm::Expected<OclRuntime> OclRuntime::get(cl_command_queue queue) {
+  cl_device_id device;
+  cl_context context;
+  CL_CHECK(clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id),
+                                 &device, nullptr),
+           "Failed to get CL_QUEUE_DEVICE.");
+  CL_CHECK(clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context),
+                                 &context, nullptr),
+           "Failed to get CL_QUEUE_CONTEXT.");
+  assert(device);
+  assert(context);
+  return get(device, context);
+}
+
+llvm::Expected<OclRuntime> OclRuntime::get(cl_device_id device,
+                                           cl_context context) {
+  auto ext = Ext::get(device, context);
+  CHECKE(ext, "Failed to create OclRuntime::Ext.");
+  return OclRuntime{*ext.get()};
+}
+
+bool OclRuntime::isOutOfOrder(cl_command_queue queue) {
+  cl_command_queue_properties properties;
+  cl_int err = clGetCommandQueueInfo(queue, CL_QUEUE_PROPERTIES,
+                                     sizeof(cl_command_queue_properties),
+                                     &properties, nullptr);
+  if (err != CL_SUCCESS) {
+    gcLogE("clGetCommandQueueInfo() failed with error code ", err);
+    // Enforcing out-of-order execution mode
+    return true;
+  }
+  return properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
+}
+
+cl_context OclRuntime::getContext() const { return ext.context; }
+
+cl_device_id OclRuntime::getDevice() const { return ext.device; }
+
+llvm::Expected<cl_command_queue>
+OclRuntime::createQueue(bool outOfOrder) const {
+  cl_int err;
+  cl_command_queue queue;
+#ifdef CL_VERSION_2_0
+  cl_queue_properties properties[] = {
+      CL_QUEUE_PROPERTIES,
+      static_cast<cl_queue_properties>(
+          outOfOrder ? CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE : 0),
+      0};
+  queue = clCreateCommandQueueWithProperties(ext.context, ext.device,
+                                             properties, &err);
+#else
+  const cl_command_queue_properties properties =
+      outOfOrder ? CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE : 0;
+  queue = clCreateCommandQueue(ext.context, ext.device, properties, &err);
+#endif
+  CL_CHECK(err, "Failed to create ", outOfOrder ? "out-of-order " : "",
+           "OpenCL command queue.");
+  gcLogD("Created new ", outOfOrder ? "out-of-order " : "",
+         "OpenCL command queue: ", queue);
+  return queue;
+}
+
+llvm::Expected<bool> OclRuntime::releaseQueue(cl_command_queue queue) {
+  CL_CHECK(clReleaseCommandQueue(queue),
+           "Failed to release OpenCL command queue: ", queue);
+  gcLogD("Released OpenCL command queue: ", queue);
+  return true;
+}
+
+llvm::Expected<void *> OclRuntime::usmAllocDev(size_t size) const {
+  cl_int err;
+  void *ptr = ext.clDeviceMemAllocINTEL(ext.context, ext.device, nullptr,
+                                        size, 0, &err);
+  CL_CHECK(err, "Failed to allocate ", size, " bytes of device USM memory.");
+  gcLogD("Allocated ", size, " bytes of device USM memory: ", ptr);
+  return ptr;
+}
+
+llvm::Expected<void *> OclRuntime::usmAllocShared(size_t size) const {
+  cl_int err;
+  void *ptr = ext.clSharedMemAllocINTEL(ext.context, ext.device, nullptr,
+                                        size, 0, &err);
+  CL_CHECK(err, "Failed to allocate ", size, " bytes of shared USM memory.");
+  gcLogD("Allocated ", size, " bytes of shared USM memory: ", ptr);
+  return ptr;
+}
+
+llvm::Expected<bool> OclRuntime::usmFree(const void *ptr) const {
+  CL_CHECK(ext.clMemFreeINTEL(ext.context, const_cast<void *>(ptr)),
+           "Failed to free USM memory: ", ptr);
+  gcLogD("Deallocated USM memory: ", ptr);
+  return true;
+}
+
+llvm::Expected<bool> OclRuntime::usmCpy(OclContext &ctx, const void *src,
+                                        void *dst, size_t size) const {
+  cl_int err;
+  if (ctx.preserveOrder) {
+    cl_event event;
+    err = ext.clEnqueueMemcpyINTEL(ctx.queue, false, dst, src, size,
+                                   ctx.waitListLen, ctx.waitList, &event);
+    ctx.setLastEvent(event);
+  } else {
+    err = ext.clEnqueueMemcpyINTEL(ctx.queue, false, dst, src, size, 0,
+                                   nullptr, nullptr);
+  }
+  CL_CHECK(err, "Failed to copy ", size, " bytes from ", src, " to ", dst);
+  gcLogD("Enqueued USM memory copy of ", size, " bytes from ", src, " to ",
+         dst);
+  return true;
+}
+
+bool OclRuntime::isUsm(const void *ptr) const {
+  cl_mem_info_intel allocType;
+  auto err = ext.clGetMemAllocInfoINTEL(
+      ext.context, ptr, CL_MEM_ALLOC_TYPE_INTEL, sizeof(cl_mem_info_intel),
+      &allocType, nullptr);
+  return err == CL_SUCCESS && allocType != CL_MEM_TYPE_UNKNOWN_INTEL;
+}
+
+#ifndef NDEBUG
+void OclRuntime::debug(const char *file, int line, const char *msg) {
+#ifndef GC_LOG_NO_DEBUG
+  mlir::gc::log::insetLog(file, line, std::cout, "DEBUG", msg);
+#endif
+}
+#endif
+
+llvm::Expected<bool> OclContext::finish() {
+  gcLogD("Waiting for the enqueued OpenCL commands to finish: ", queue);
+  CL_CHECK(clFinish(queue),
+           "Failed to finish the OpenCL command queue: ", queue);
+  if (preserveOrder) {
+    waitListLen = 0;
+    waitList = nullptr;
+    lastEvent = nullptr;
+  }
+  return true;
+}
+
+OclModule::~OclModule() {
+  assert(engine);
+  auto fn = engine->lookup(GPU_OCL_MOD_DESTRUCTOR);
+  if (fn) {
+    reinterpret_cast<void (*)()>(fn.get())();
+  } else {
+    gcLogE("Module function ", GPU_OCL_MOD_DESTRUCTOR, " not found!");
+    llvm::consumeError(fn.takeError());
+  }
+}
+
+// If all arguments of 'origFunc' are memrefs with static shape, create a new
+// function called gcGpuOclStaticMain, that accepts 2 arguments: a pointer to
+// OclContext and a pointer to an array, containing pointers to aligned memory
+// buffers. The function will call the original function with the context,
+// buffers and the offset/shape/strides, statically created from the
+// memref descriptor.
+StringRef createStaticMain(ModuleOp &module, const StringRef &funcName,
+                           const ArrayRef<Type> argTypes) {
+  auto mainFunc = module.lookupSymbol<LLVM::LLVMFuncOp>(funcName);
+  if (!mainFunc) {
+    gcReportErr("The function '", funcName.begin(), "' not found.");
+  }
+
+  // Check that the last 3 args are added by AddContextArg
+  auto mainArgTypes = mainFunc.getArgumentTypes();
+  auto nargs = mainArgTypes.size();
+  if (nargs < 3) {
+    gcReportErr("The function '", funcName.begin(),
+                "' must have at least 3 arguments.");
+  }
+
+  auto ctx = module.getContext();
+  ctx->getOrLoadDialect<LLVM::LLVMDialect>();
+  OpBuilder builder(ctx);
+  auto i64Type = builder.getI64Type();
+  auto ptrType = LLVM::LLVMPointerType::get(ctx);
+
+  if (mainArgTypes[nargs - 3] != ptrType ||
+      mainArgTypes[nargs - 2] != ptrType ||
+      mainArgTypes[nargs - 1] != i64Type) {
+    gcReportErr("The last 3 arguments of the function '", funcName.begin(),
+                "' must be of type (!llvm.ptr, !llvm.ptr, i64).");
+  }
+
+  // argTypes contains only the original arguments, before lowering
+  nargs = argTypes.size();
+  if (nargs == 0) {
+    // This is a no-arg function with the context param added by AddContextArg
+    return funcName;
+  }
+
+  mainFunc.setAlwaysInline(true);
+  SmallVector<int64_t> constArgs;
+  unsigned argsCounter = 0;
+
+  for (unsigned i = 0; i < nargs; ++i) {
+    if (auto type = mlir::dyn_cast<MemRefType>(argTypes[i])) {
+      if (!type.hasStaticShape()) {
+        gcLogD("The argument ", i, " of the function ", funcName.begin(),
+               " has a dynamic shape.");
+        return {};
+      }
+
+      auto shape = type.getShape();
+      SmallVector<int64_t> strides;
+      int64_t offset;
+      if (failed(getStridesAndOffset(type, strides, offset))) {
+        gcLogD("Failed to get strides and offset of arg", i,
+               " of the function ", funcName.begin());
+        return {};
+      }
+      constArgs.emplace_back(offset);
+      constArgs.append(shape.begin(), shape.end());
+      constArgs.append(strides.begin(), strides.end());
+      argsCounter += shape.size() * 2 + 3;
+    } else {
+      gcLogD("The argument ", i, " of the function ", funcName.begin(),
+             " is not of type MemRefType.");
+      return {};
+    }
+  }
+
+  auto loc = mainFunc.getLoc();
+  auto newFuncType = LLVM::LLVMFunctionType::get(
+      mainFunc.getNumResults() ? mainFunc->getResult(0).getType()
+                               : LLVM::LLVMVoidType::get(ctx),
+      {ptrType, ptrType});
+  auto newFunc =
+      OpBuilder::atBlockEnd(module.getBody())
+          .create<LLVM::LLVMFuncOp>(loc, "gcGpuOclStaticMain", newFuncType);
+  auto &entryBlock = *newFunc.addEntryBlock(builder);
+  builder.setInsertionPointToStart(&entryBlock);
+  Value arrayPtr = entryBlock.getArgument(1);
+
+  std::unordered_map<int64_t, Value> constMap;
+  auto createConst = [&](int64_t i) {
+    if (auto v = constMap.find(i); v != constMap.end()) {
+      return v->second;
+    }
+    return constMap
+        .emplace(i, builder.create<LLVM::ConstantOp>(
+                        loc, i64Type, builder.getIntegerAttr(i64Type, i)))
+        .first->second;
+  };
+  Value zero = createConst(0);
+  Value one = nargs ? createConst(1) : Value{};
+  SmallVector<Value> args;
+  args.reserve(argsCounter);
+
+  for (unsigned i = 0, j = 0; i < nargs; i++) {
+    if (i != 0) {
+      arrayPtr =
+          builder.create<LLVM::GEPOp>(loc, ptrType, ptrType, arrayPtr, one);
+    }
+
+    auto ptr = builder.create<LLVM::LoadOp>(loc, ptrType, arrayPtr);
+    args.emplace_back(ptr);
+    args.emplace_back(ptr);
+    args.emplace_back(createConst(constArgs[j++]));
+
+    for (unsigned k = 0,
+                  m = 2 * mlir::cast<MemRefType>(argTypes[i]).getShape().size();
+         k < m; k++) {
+      args.emplace_back(createConst(constArgs[j++]));
+    }
+  }
+
+  auto oclCtxArg = entryBlock.getArgument(0);
+  args.emplace_back(oclCtxArg);
+  args.emplace_back(oclCtxArg);
+  args.emplace_back(zero);
+
+  auto call = builder.create<LLVM::CallOp>(loc, mainFunc, args);
+  builder.create<LLVM::ReturnOp>(loc, call.getResults());
+  return newFunc.getName();
+}
+
+StringRef getFuncName(const OclModuleBuilderOpts &opts, ModuleOp &mod) {
+  if (!opts.funcName.empty()) {
+    return opts.funcName;
+  }
+  for (auto &op : mod.getBody()->getOperations()) {
+    if (auto fn = dyn_cast<LLVM::LLVMFuncOp>(op);
+        fn && !fn.isExternal() && fn.isPublic()) {
+      return fn.getName();
+    }
+    if (auto fn = dyn_cast<func::FuncOp>(op);
+        fn && !fn.isExternal() && fn.isPublic()) {
+      return fn.getName();
+    }
+  }
+  gcReportErr("Failed to find a public function in the module.");
+}
+
+ArrayRef<Type> getArgTypes(const StringRef &funcName, ModuleOp &mod) {
+  if (auto fn = mod.lookupSymbol<LLVM::LLVMFuncOp>(funcName)) {
+    return fn.getArgumentTypes();
+  }
+  if (auto fn = mod.lookupSymbol<func::FuncOp>(funcName)) {
+    return fn.getArgumentTypes();
+  }
+  gcReportErr("Failed to find the function '", funcName.begin(),
+              "' in the module.");
+}
+
+OclModuleBuilder::OclModuleBuilder(ModuleOp module,
+                                   const OclModuleBuilderOpts &opts)
+    : mlirModule(module), enableObjectDump(opts.enableObjectDump),
+      sharedLibPaths(opts.sharedLibPaths),
+      pipeline(opts.pipeline
+                   ? opts.pipeline
+                   : [](OpPassManager &pm) { populateGPUPipeline(pm, {}); }),
+      funcName(getFuncName(opts, mlirModule)),
+      argTypes(getArgTypes(funcName, mlirModule)) {}
+
+llvm::Expected<std::shared_ptr<const OclModule>>
+OclModuleBuilder::build(const OclRuntime &runtime) {
+  {
+    std::shared_lock lock(mux);
+    if (auto it = cache.find(runtime.ext); it != cache.end()) {
+      return it->second;
+    }
+  }
+  return build(runtime.ext);
+}
+
+llvm::Expected<std::shared_ptr<const OclModule>>
+OclModuleBuilder::build(cl_command_queue queue) {
+  auto rt = OclRuntime::get(queue);
+  CHECKE(rt, "Failed to create OclRuntime.");
+  return build(rt.get());
+}
+
+llvm::Expected<std::shared_ptr<const OclModule>>
+OclModuleBuilder::build(cl_device_id device, cl_context context) {
+  {
+    OclDevCtxPair pair{device, context};
+    std::shared_lock lock(mux);
+    if (auto it = cache.find(pair); it != cache.end()) {
+      return it->second;
+    }
+  }
+
+  auto ext = OclRuntime::Ext::get(device, context);
+  CHECKE(ext, "Failed to create OclRuntime::Ext.");
+  return build(*ext.get());
+}
+
+llvm::Expected<std::shared_ptr<const OclModule>>
+OclModuleBuilder::build(const OclRuntime::Ext &ext) {
+  auto mod = mlirModule.clone();
+  PassManager pm{mod.getContext()};
+  pipeline(pm);
+  CHECK(!pm.run(mod).failed(), "GPU pipeline failed!");
+
+  auto staticMain = createStaticMain(mod, funcName, argTypes);
+
+  ExecutionEngineOptions opts;
+  opts.jitCodeGenOptLevel = llvm::CodeGenOptLevel::Aggressive;
+  opts.enableObjectDump = enableObjectDump;
+#ifdef NDEBUG
+  opts.enableGDBNotificationListener = false;
+  opts.enablePerfNotificationListener = false;
+#endif
+
+  auto eng = ExecutionEngine::create(mod, opts);
+  CHECKE(eng, "Failed to create ExecutionEngine!");
+  eng->get()->registerSymbols(OclRuntime::Exports::symbolMap);
+
+  OclModule::MainFunc main = {nullptr};
+
+  if (staticMain.empty()) {
+    auto expect = eng.get()->lookupPacked(funcName);
+    CHECKE(expect, "Packed function '", funcName.begin(), "' not found!");
+    main.wrappedMain = *expect;
+  } else {
+    auto expect = eng.get()->lookup(staticMain);
+    CHECKE(expect, "Compiled function '", staticMain.begin(), "' not found!");
+    main.staticMain = reinterpret_cast<OclModule::StaticMainFunc>(*expect);
+  }
+
+  std::lock_guard lock(mux);
+  if (auto it = cache.find(ext); it != cache.end()) {
+    return it->second;
+  }
+  std::shared_ptr<const OclModule> ptr(
+      new OclModule(OclRuntime(ext), !staticMain.empty(), main, argTypes,
+                    std::move(eng.get())));
+  return cache.emplace(OclDevCtxPair(ext.device, ext.context), ptr)
+      .first->second;
+}
+} // namespace mlir::gc::gpu
\ No newline at end of file

diff --git a/lib/gc/Transforms/GPU/CMakeLists.txt b/lib/gc/Transforms/GPU/CMakeLists.txt
index 3909681e3..47d3899bb 100644
--- a/lib/gc/Transforms/GPU/CMakeLists.txt
+++ b/lib/gc/Transforms/GPU/CMakeLists.txt
@@ -1,3 +1,15 @@
+include(imex)
+get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES)
+get_property(IMEX_DIALECT_LIBS GLOBAL PROPERTY IMEX_DIALECT_LIBS)
+get_property(IMEX_CONVERSION_LIBS GLOBAL PROPERTY IMEX_CONVERSION_LIBS)
+set(IMEX_LIBS
+  ${IMEX_CONVERSION_LIBS}
+  IMEXUtil
+  IMEXTransforms
+  ${IMEX_DIALECT_LIBS}
+)
+set_property(GLOBAL APPEND PROPERTY IMEX_LIBS ${IMEX_LIBS})
+
 gc_add_mlir_library(GcGpuPasses
   AddContextArg.cpp
   GpuToGpuOcl.cpp
@@ -19,9 +31,10 @@ gc_add_mlir_library(GcGpuPasses
   MLIRMemRefTransforms
   GcInterface
   GcUtilsIR
+  ${IMEX_LIBS}
 )
 
-include(imex)
-get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES)
-target_include_directories(GcGpuPasses PRIVATE ${IMEX_INCLUDES})
-
+foreach (include_dir IN LISTS IMEX_INCLUDES)
+  list(APPEND IMEX_BUILD_INCLUDES $<BUILD_INTERFACE:${include_dir}>)
+endforeach ()
+target_include_directories(GcGpuPasses PUBLIC ${IMEX_BUILD_INCLUDES})

diff --git a/lib/gc/Transforms/GPU/GpuToGpuOcl.cpp b/lib/gc/Transforms/GPU/GpuToGpuOcl.cpp
index f765d29cf..3bad9fc0f 100644
--- a/lib/gc/Transforms/GPU/GpuToGpuOcl.cpp
+++ b/lib/gc/Transforms/GPU/GpuToGpuOcl.cpp
@@ -124,11 +124,13 @@ struct ConvertAlloc final : ConvertOpPattern<gpu::AllocOp> {
   LogicalResult
   matchAndRewrite(gpu::AllocOp allocOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    auto fnName =
+        allocOp.getHostShared() ? GPU_OCL_MALLOC_SHARED : GPU_OCL_MALLOC_DEV;
     auto loc = allocOp.getLoc();
     MemRefType type = allocOp.getType();
 
     if (auto staticSize = helper.calculateStaticSize(rewriter, loc, type)) {
-      auto ptr = funcCall(rewriter, GPU_OCL_MALLOC, helper.ptrType,
+      auto ptr = funcCall(rewriter, fnName, helper.ptrType,
                           {helper.ptrType, helper.idxType}, loc,
                           {getCtxPtr(rewriter), staticSize})
                      .getResult();
@@ -151,7 +153,7 @@ struct ConvertAlloc final : ConvertOpPattern<gpu::AllocOp> {
                                shape, strides, size);
 
     assert(shape.size() == strides.size());
-    auto ptr = funcCall(rewriter, GPU_OCL_MALLOC, helper.ptrType,
+    auto ptr = funcCall(rewriter, fnName, helper.ptrType,
                         {helper.ptrType, helper.idxType}, loc,
                         {getCtxPtr(rewriter), size})
                    .getResult();
@@ -224,7 +226,8 @@ struct ConvertMemcpy final : ConvertOpPattern<gpu::MemcpyOp> {
 
 struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
 
-  explicit ConvertLaunch(const Helper &helper) : ConvertOpPattern(helper) {}
+  explicit ConvertLaunch(const Helper &helper, bool callFinish)
+      : ConvertOpPattern(helper), callFinish(callFinish) {}
 
   LogicalResult
   matchAndRewrite(gpu::LaunchFuncOp gpuLaunch, OpAdaptor adaptor,
@@ -261,10 +264,17 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
         funcCall(rewriter, GPU_OCL_KERNEL_LAUNCH, helper.voidType,
                  {helper.ptrType, helper.ptrType}, loc, args, true);
     rewriter.replaceOp(gpuLaunch, gpuOclLaunch);
+
+    if (callFinish) {
+      funcCall(rewriter, GPU_OCL_FINISH, helper.voidType, {helper.ptrType},
+               loc, getCtxPtr(rewriter));
+    }
+
     return success();
   }
 
 private:
+  bool callFinish;
   // Returns the kernel pointer stored in the global var ...name_Ptr.
   // If it's NULL, calls the createKernel() function.
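+  // The emitted check is roughly equivalent to this pseudo-code (an
+  // illustrative sketch only; the actual LLVM dialect IR is constructed
+  // programmatically below):
+  //
+  //   kernel = load @<kernel_name>_Ptr
+  //   if (kernel == null)
+  //     kernel = gcGpuOclKernelCreate(ctx, spirv, name, ...)
+  //   return kernel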
std::optional getKernel(gpu::LaunchFuncOp &gpuLaunch, @@ -377,8 +387,9 @@ struct ConvertLaunch final : ConvertOpPattern { auto ptr = mod.lookupSymbol(str("Ptr")); assert(ptr); - SmallVector nameChars(kernelModName.getValue().begin(), - kernelModName.getValue().end()); + auto kernelName = gpuLaunch.getKernelName(); + SmallVector nameChars(kernelName.getValue().begin(), + kernelName.getValue().end()); nameChars.emplace_back('\0'); // Kernel name and SPIRV are stored as global strings auto name = LLVM::createGlobalString( @@ -495,6 +506,10 @@ struct ConvertLaunch final : ConvertOpPattern { struct GpuToGpuOcl final : gc::impl::GpuToGpuOclBase { + explicit GpuToGpuOcl() : GpuToGpuOcl(gc::GpuToGpuOclOptions{}) {} + explicit GpuToGpuOcl(const gc::GpuToGpuOclOptions &opts) + : GpuToGpuOclBase(opts) {} + void runOnOperation() override { const auto ctx = &getContext(); const LLVMConversionTarget target(getContext()); @@ -503,8 +518,9 @@ struct GpuToGpuOcl final : gc::impl::GpuToGpuOclBase { RewritePatternSet patterns(ctx); populateGpuToLLVMConversionPatterns(converter, patterns); - patterns.insert( - helper); + patterns.insert(helper); + patterns.insert(helper, callFinish); + patterns.insert(helper); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) { diff --git a/lib/gc/Transforms/GPU/Pipeline.cpp b/lib/gc/Transforms/GPU/Pipeline.cpp index d7bc69e13..c1c3735aa 100644 --- a/lib/gc/Transforms/GPU/Pipeline.cpp +++ b/lib/gc/Transforms/GPU/Pipeline.cpp @@ -6,49 +6,33 @@ // //===----------------------------------------------------------------------===// +#include + +#include "gc/Transforms/Passes.h" + +#include "imex/Conversion/Passes.h" +#include "imex/Transforms/Passes.h" + #include "mlir/Conversion/Passes.h" -#include "mlir/Dialect/Arith/Transforms/Passes.h" #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h" #include "mlir/Dialect/Bufferization/Transforms/Passes.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/LLVMIR/Transforms/Passes.h" #include "mlir/Dialect/Linalg/Passes.h" -#include "mlir/Dialect/Math/Transforms/Passes.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/MemRef/Transforms/Passes.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/IR/DialectRegistry.h" +#include "mlir/Dialect/SPIRV/Transforms/Passes.h" #include "mlir/InitAllPasses.h" #include "mlir/Pass/PassManager.h" -#include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/Passes.h" -#include - -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/GPU/Transforms/Passes.h" -#include "mlir/Dialect/SPIRV/Transforms/Passes.h" - -#include -#include - -#include - -#include "gc/Transforms/Passes.h" namespace mlir::gc { -struct GPUPipelineOption : PassPipelineOptions { - PassOptions::Option isUsmArgs{ - *this, "is-usm-args", - llvm::cl::desc("Whether to use USM(unified shared memory) func args, in " - "which the host and device could access the same buffer " - "and there is no need to add memcpy explicitly"), - llvm::cl::init(true)}; -}; - -void populateGPUPipeline(mlir::OpPassManager &pm, +void populateGPUPipeline(OpPassManager &pm, const GPUPipelineOption &pipelineOption) { + // Add an argument for the GPU context + pm.addNestedPass(createAddContextArg()); + pm.addNestedPass(createIterativeTilingAndFusion()); pm.addPass(bufferization::createEmptyTensorEliminationPass()); @@ -91,6 +75,7 
@@ void populateGPUPipeline(mlir::OpPassManager &pm, /*isUsmArgs*/ pipelineOption.isUsmArgs.getValue()}; pm.addNestedPass( imex::createInsertGPUAllocsPass(insertGPUAllocsOption)); + pm.addPass(createGpuKernelOutliningPass()); pm.addPass(createCanonicalizerPass()); pm.addPass(imex::createSetSPIRVCapabilitiesPass()); @@ -109,7 +94,6 @@ void populateGPUPipeline(mlir::OpPassManager &pm, pm.addNestedPass(LLVM::createRequestCWrappersPass()); pm.addPass(imex::createSerializeSPIRVPass()); pm.addPass(createConvertVectorToSCFPass()); - pm.addPass(imex::createConvertGPUToGPUXPass()); pm.addPass(createConvertSCFToCFPass()); pm.addPass(createConvertControlFlowToLLVMPass()); pm.addPass(createConvertVectorToLLVMPass()); @@ -117,7 +101,7 @@ void populateGPUPipeline(mlir::OpPassManager &pm, pm.addPass(createArithToLLVMConversionPass()); pm.addPass(createConvertFuncToLLVMPass()); pm.addPass(createConvertMathToLLVMPass()); - pm.addPass(imex::createConvertGPUXToLLVMPass()); + pm.addPass(createGpuToGpuOcl({pipelineOption.callFinish})); pm.addPass(createConvertIndexToLLVMPass()); pm.addPass(memref::createExpandStridedMetadataPass()); pm.addPass(createLowerAffinePass()); diff --git a/src/gc-opt/CMakeLists.txt b/src/gc-opt/CMakeLists.txt index 44de22b32..d6a9b6560 100644 --- a/src/gc-opt/CMakeLists.txt +++ b/src/gc-opt/CMakeLists.txt @@ -49,7 +49,6 @@ target_link_libraries(gc-opt PRIVATE if(GC_ENABLE_IMEX) include(imex) - target_compile_options(gc-opt PRIVATE -DGC_USE_IMEX) get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES) target_include_directories(gc-opt PRIVATE ${IMEX_INCLUDES}) target_link_libraries(gc-opt PRIVATE diff --git a/test/mlir/test/gc/Transforms/GPU/gpu-to-gpuocl.mlir b/test/mlir/test/gc/Transforms/GPU/gpu-to-gpuocl.mlir index 8a5571571..0c3998b5d 100644 --- a/test/mlir/test/gc/Transforms/GPU/gpu-to-gpuocl.mlir +++ b/test/mlir/test/gc/Transforms/GPU/gpu-to-gpuocl.mlir @@ -68,7 +68,7 @@ module @test attributes {gpu.container_module} { // CHECK: llvm.func @entry(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, [[CTX:%.+]]: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64) // CHECK: [[SIZE:%.+]] = llvm.mlir.constant(16384 : i64) : i64 -// CHECK: llvm.call @gcGpuOclMalloc([[CTX]], [[SIZE]]) +// CHECK: llvm.call @gcGpuOclMallocShared([[CTX]], [[SIZE]]) // CHECK: [[SIZE:%.+]] = llvm.mlir.constant(16384 : i64) : i64 // CHECK: [[SRC:%.+]] = llvm.extractvalue // CHECK: [[DST:%.+]] = llvm.extractvalue [[GPU_MEMREF:%.+]][1] diff --git a/test/mlir/test/gc/gpu-runner/lit.local.cfg b/test/mlir/test/gc/gpu-runner/lit.local.cfg index f180dd41b..6dce3e36e 100644 --- a/test/mlir/test/gc/gpu-runner/lit.local.cfg +++ b/test/mlir/test/gc/gpu-runner/lit.local.cfg @@ -1,2 +1,4 @@ if not config.gc_use_imex: - config.unsupported = True \ No newline at end of file + config.unsupported = True +else: # FIXME: Remove this when the GPU runner is implemented. 
+    config.unsupported = True
diff --git a/test/mlir/unittests/ExecutionEngine/CMakeLists.txt b/test/mlir/unittests/ExecutionEngine/CMakeLists.txt
index 9f5747b58..9c28d75d4 100644
--- a/test/mlir/unittests/ExecutionEngine/CMakeLists.txt
+++ b/test/mlir/unittests/ExecutionEngine/CMakeLists.txt
@@ -9,5 +9,9 @@ target_include_directories(GCExecutionEngineTests PRIVATE ${GC_DNNL_INCLUDES})
 target_link_libraries(GCExecutionEngineTests
         PRIVATE
         GcJitWrapper
-        GcCpuRuntime)
+        GcCpuRuntime
+)
+
+if(GC_ENABLE_IMEX)
+  add_subdirectory(GPU)
+endif()
diff --git a/test/mlir/unittests/ExecutionEngine/GPU/CMakeLists.txt b/test/mlir/unittests/ExecutionEngine/GPU/CMakeLists.txt
new file mode 100644
index 000000000..1aefee321
--- /dev/null
+++ b/test/mlir/unittests/ExecutionEngine/GPU/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_mlir_unittest(GCExecutionEngineGpuTests
+  GpuOclRuntimeTest.cpp
+)
+target_link_libraries(GCExecutionEngineGpuTests
+  PRIVATE
+  GcJitWrapper
+  GcGpuOclRuntime
+)
diff --git a/test/mlir/unittests/ExecutionEngine/GPU/GpuOclRuntimeTest.cpp b/test/mlir/unittests/ExecutionEngine/GPU/GpuOclRuntimeTest.cpp
new file mode 100644
index 000000000..9d7b18bca
--- /dev/null
+++ b/test/mlir/unittests/ExecutionEngine/GPU/GpuOclRuntimeTest.cpp
@@ -0,0 +1,232 @@
+//===-- GpuOclRuntimeTest.cpp - GPU OpenCL runtime tests --------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+#include
+
+#include "gc/ExecutionEngine/Driver/Driver.h"
+#include "gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h"
+#include "gc/Utils/Error.h"
+
+#include "mlir/ExecutionEngine/MemRefUtils.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/InitAllDialects.h"
+#include "mlir/InitAllPasses.h"
+#include "mlir/Parser/Parser.h"
+#include "mlir/Target/LLVMIR/Export.h"
+#include "mlir/Target/LLVMIR/ModuleTranslation.h"
+
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+
+#include "gtest/gtest.h"
+
+using namespace mlir;
+using namespace gc::gpu;
+
+constexpr char addStatic[] = R"mlir(
+module @test {
+  func.func @entry(%arg0: memref<64x64xf32>, %arg1: memref<64x64xf32>, %arg2: memref<64x64xf32>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<64x64xf32>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<64x64xf32>
+    %2 = tensor.empty() : tensor<64x64xf32>
+    %3 = linalg.add ins(%1, %0 : tensor<64x64xf32>, tensor<64x64xf32>) outs(%2 : tensor<64x64xf32>) -> tensor<64x64xf32>
+    bufferization.materialize_in_destination %3 in restrict writable %arg2 : (tensor<64x64xf32>, memref<64x64xf32>) -> ()
+    return
+  }
+}
+)mlir";
+
+constexpr char addDynamic[] = R"mlir(
+module @test {
+  func.func @entry(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = bufferization.to_tensor %arg0 restrict : memref<?x?xf32>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<?x?xf32>
+    %d0 = memref.dim %arg0, %c0 : memref<?x?xf32>
+    %d1 = memref.dim %arg0, %c1 : memref<?x?xf32>
+    %2 = tensor.empty(%d0, %d1) : tensor<?x?xf32>
+    %3 = linalg.add ins(%1, %0 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%2 : tensor<?x?xf32>) -> tensor<?x?xf32>
+    bufferization.materialize_in_destination %3 in restrict writable %arg2 : (tensor<?x?xf32>, memref<?x?xf32>) -> ()
+    return
+  }
+}
+)mlir";
+
+constexpr char matmulAddStatic[] = R"mlir(
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @entry(%arg0: memref<64x128xf32>, %arg1: memref<128x128xf32>, %arg2: memref<64x128xf32>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<64x128xf32>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<128x128xf32>
+    %2 = tensor.empty() : tensor<64x128xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<64x128xf32>) -> tensor<64x128xf32>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<64x128xf32>, tensor<128x128xf32>) outs(%3 : tensor<64x128xf32>) -> tensor<64x128xf32>
+    %5 = tensor.empty() : tensor<64x128xf32>
+    %6 = linalg.add ins(%4, %0 : tensor<64x128xf32>, tensor<64x128xf32>) outs(%5 : tensor<64x128xf32>) -> tensor<64x128xf32>
+    bufferization.materialize_in_destination %6 in restrict writable %arg2 : (tensor<64x128xf32>, memref<64x128xf32>) -> ()
+    return
+  }
+}
+)mlir";
+
+struct TestBase {
+  OclRuntime runtime = gcGetOrReport(OclRuntime::get());
+  cl_command_queue queue = gcGetOrReport(runtime.createQueue());
+  OclContext ctx{runtime, queue};
+  MLIRContext mlirCtx{gc::initCompilerAndGetDialects()};
+
+  virtual void exec(std::shared_ptr<const OclModule> &mod) = 0;
+
+  virtual ~TestBase() { gcGetOrReport(runtime.releaseQueue(queue)); }
+
+  OwningOpRef<ModuleOp> parse(const char *code) {
+    std::unique_ptr<llvm::MemoryBuffer> memBuf =
+        llvm::MemoryBuffer::getMemBuffer(code);
+    llvm::SourceMgr srcMgr;
+    srcMgr.AddNewSourceBuffer(std::move(memBuf), SMLoc());
+    return parseSourceFile<ModuleOp>(srcMgr, &mlirCtx);
+  }
+};
+
+template <unsigned N, unsigned M = N> struct TestAdd : TestBase {
+  static constexpr unsigned size = N * M;
+  float *buf0 = gcGetOrReport(runtime.usmNewDev<float>(size));
+  float *buf1 = gcGetOrReport(runtime.usmNewDev<float>(size));
+  float *buf2 = gcGetOrReport(runtime.usmNewShared<float>(size));
+
+  explicit TestAdd() {
+    float cpuBuf[size];
+    std::fill(cpuBuf, cpuBuf + size, 2.0f);
+    assert(runtime.usmCpy(ctx, cpuBuf, buf0, size));
+    assert(runtime.usmCpy(ctx, cpuBuf, buf1, size));
+    gcGetOrReport(ctx.finish());
+  }
+
+  ~TestAdd() override {
+    assert(runtime.usmFree(buf0));
+    assert(runtime.usmFree(buf1));
+    assert(runtime.usmFree(buf2));
+  }
+
+  void test(const char *code) {
+    OclModuleBuilder builder(parse(code));
+    auto mod = gcGetOrReport(builder.build(runtime));
+    exec(mod);
+
+    float cpuBuf[size];
+    assert(runtime.usmCpy(ctx, buf2, cpuBuf, size));
+    gcGetOrReport(ctx.finish());
+
+    for (unsigned i = 0; i < size; i++) {
+      // std::cout << buf2[i] << " ";
+      assert(buf2[i] == 4.0f);
+    }
+    // std::cout << "\n";
+
+    for (float i : cpuBuf) {
+      // std::cout << i << " ";
+      assert(i == 4.0f);
+    }
+  }
+};
+
+template <unsigned N, unsigned M> struct TestMatmulAdd : TestBase {
+  static constexpr unsigned size1 = N * M;
+  static constexpr unsigned size2 = M * M;
+  float *buf0 = gcGetOrReport(runtime.usmNewDev<float>(size1));
+  float *buf1 = gcGetOrReport(runtime.usmNewDev<float>(size2));
+  float *buf2 = gcGetOrReport(runtime.usmNewShared<float>(size1));
+
+  explicit TestMatmulAdd() {
+    float cpuBuf[size2];
+    std::fill(cpuBuf, cpuBuf + size2, 2.0f);
+    assert(runtime.usmCpy(ctx, cpuBuf, buf0, size1));
+    assert(runtime.usmCpy(ctx, cpuBuf, buf1, size2));
+    gcGetOrReport(ctx.finish());
+  }
+
+  ~TestMatmulAdd() override {
+    assert(runtime.usmFree(buf0));
+    assert(runtime.usmFree(buf1));
+    assert(runtime.usmFree(buf2));
+  }
+
+  void test(const char *code) {
+    OclModuleBuilder builder(parse(code));
+    auto mod = gcGetOrReport(builder.build(runtime));
+    exec(mod);
+
+    gcGetOrReport(ctx.finish());
+    for (unsigned i = 0; i < size1; i++) {
+      // std::cout << buf2[i] << " ";
+      // Each output element is 2 * 2 summed over K=128 (matmul), plus 2 (add): 512 + 2 = 514.
+      assert(buf2[i] == 514);
+    }
+    // std::cout << "\n";
+  }
+};
+
+TEST(GpuOclRuntime, TestAddStatic) {
+  struct TestAddStatic1 : TestAdd<64> {
+    void exec(std::shared_ptr<const OclModule> &mod) override {
+      assert(mod->isStatic);
+      StaticExecutor<3> exec(mod);
+      exec(ctx, buf0, buf1, buf2);
+      // Check if the executor is allocated on the stack
+      assert(exec.isSmall());
+    }
+  } test1;
+  test1.test(addStatic);
+
+  struct TestAddStatic2 : TestAdd<64> {
+    void exec(std::shared_ptr<const OclModule> &mod) override {
+      assert(mod->isStatic);
+      StaticExecutor<3> exec(mod);
+      exec.arg(buf0);
+      exec.arg(buf1);
+      exec.arg(buf2);
+      // Check if the executor is allocated on the stack
+      assert(exec.isSmall());
+      exec(ctx);
+    }
+  } test2;
+  test2.test(addStatic);
+}
+
+TEST(GpuOclRuntime, TestAddDynamic) {
+  GTEST_SKIP() << "Dynamic shapes are not yet supported";
+  struct TestAddDynamic : TestAdd<32, 64> {
+    void exec(std::shared_ptr<const OclModule> &mod) override {
+      assert(!mod->isStatic);
+      int64_t shape[] = {32, 64};
+      int64_t strides[] = {64, 1};
+      DynamicExecutor<24> exec(mod);
+      exec.arg(buf0, 2, shape, strides);
+      exec.arg(buf1, 2, shape, strides);
+      exec.arg(buf2, 2, shape, strides);
+      exec(ctx);
+      // Check if the executor is allocated on the stack
+      assert(exec.isSmall());
+    }
+  } test;
+  test.test(addDynamic);
+}
+
+TEST(GpuOclRuntime, TestMatmulAddStatic) {
+  struct Test : TestMatmulAdd<64, 128> {
+    void exec(std::shared_ptr<const OclModule> &mod) override {
+      assert(mod->isStatic);
+      StaticExecutor<3> exec(mod);
+      exec(ctx, buf0, buf1, buf2);
+      assert(exec.isSmall());
+    }
+  } test;
+  test.test(matmulAddStatic);
+}
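
Usage sketch (not part of the patch): the unit test above doubles as the best illustration of the new client-side flow, so the condensed example below restates it in one place: build an OclModule once with OclModuleBuilder, then launch it through a StaticExecutor. It is a minimal sketch under stated assumptions, not a verbatim copy of the runtime API: the element-typed usmNewDev<float>/usmNewShared<float> form and the std::shared_ptr<const OclModule> build result are inferred from the test reconstruction above, the main() driver and parseSourceString call are illustrative, and addStatic refers to the MLIR string constant defined in the test.

// Hypothetical standalone driver mirroring GpuOclRuntimeTest.cpp.
#include <utility>

#include "gc/ExecutionEngine/Driver/Driver.h"
#include "gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h"
#include "gc/Utils/Error.h"

#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/Parser/Parser.h"

using namespace mlir;
using namespace mlir::gc::gpu;

int main() {
  // Runtime, queue and context: the same setup as TestBase above.
  OclRuntime runtime = gcGetOrReport(OclRuntime::get());
  cl_command_queue queue = gcGetOrReport(runtime.createQueue());
  OclContext ctx{runtime, queue};

  // Two device-only inputs and one host-visible (shared) output.
  constexpr unsigned size = 64 * 64;
  float *in0 = gcGetOrReport(runtime.usmNewDev<float>(size));
  float *in1 = gcGetOrReport(runtime.usmNewDev<float>(size));
  float *out = gcGetOrReport(runtime.usmNewShared<float>(size));

  // Compile once; addStatic is the 64x64 linalg.add module from the test.
  MLIRContext mlirCtx{gc::initCompilerAndGetDialects()};
  OwningOpRef<ModuleOp> module =
      parseSourceString<ModuleOp>(addStatic, &mlirCtx);
  OclModuleBuilder builder(std::move(module));
  auto mod = gcGetOrReport(builder.build(runtime));

  // Static shapes: bind the three memref arguments and launch.
  StaticExecutor<3> exec(mod);
  exec(ctx, in0, in1, out);
  gcGetOrReport(ctx.finish()); // block until the queue drains

  // Release the USM buffers and the queue (success flags ignored for brevity).
  runtime.usmFree(in0);
  runtime.usmFree(in1);
  runtime.usmFree(out);
  gcGetOrReport(runtime.releaseQueue(queue));
  return 0;
}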