From 3ed8730f031b18560130658adef4eec349e6837c Mon Sep 17 00:00:00 2001
From: Andrey Pavlenko
Date: Tue, 10 Sep 2024 11:53:47 +0200
Subject: [PATCH] Implemented GPU OpenCL runtime

---
 .github/workflows/clang-tidy.yml              |   2 +-
 CMakeLists.txt                                |   3 +
 cmake/imex.cmake                              |   1 +
 .../GPURuntime/GpuOclRuntime.h                | 455 +++++++++-
 include/gc/Transforms/Passes.h                |  14 +-
 include/gc/Transforms/Passes.td               |   5 +
 include/gc/Utils/Error.h                      |  58 ++
 include/gc/Utils/Log.h                        |  51 ++
 lib/gc/ExecutionEngine/CMakeLists.txt         |   1 +
 .../ExecutionEngine/GPURuntime/CMakeLists.txt |   1 +
 .../GPURuntime/ocl/CMakeLists.txt             |  10 +
 .../GPURuntime/ocl/GpuOclRuntime.cpp          | 838 ++++++++++++++++++
 lib/gc/Transforms/GPU/CMakeLists.txt          |  21 +-
 lib/gc/Transforms/GPU/GpuToGpuOcl.cpp         |  30 +-
 lib/gc/Transforms/GPU/Pipeline.cpp            |  48 +-
 src/gc-opt/CMakeLists.txt                     |   1 -
 .../test/gc/Transforms/GPU/gpu-to-gpuocl.mlir |   2 +-
 test/mlir/test/gc/gpu-runner/lit.local.cfg    |   4 +-
 .../unittests/ExecutionEngine/CMakeLists.txt  |   6 +-
 .../ExecutionEngine/GPU/CMakeLists.txt        |   8 +
 .../ExecutionEngine/GPU/GpuOclRuntimeTest.cpp | 232 +++++
 21 files changed, 1740 insertions(+), 51 deletions(-)
 create mode 100644 include/gc/Utils/Error.h
 create mode 100644 include/gc/Utils/Log.h
 create mode 100644 lib/gc/ExecutionEngine/GPURuntime/CMakeLists.txt
 create mode 100644 lib/gc/ExecutionEngine/GPURuntime/ocl/CMakeLists.txt
 create mode 100644 lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp
 create mode 100644 test/mlir/unittests/ExecutionEngine/GPU/CMakeLists.txt
 create mode 100644 test/mlir/unittests/ExecutionEngine/GPU/GpuOclRuntimeTest.cpp

diff --git a/.github/workflows/clang-tidy.yml b/.github/workflows/clang-tidy.yml
index 59595c00a..52b445901 100644
--- a/.github/workflows/clang-tidy.yml
+++ b/.github/workflows/clang-tidy.yml
@@ -14,7 +14,7 @@ jobs:
 
     steps:
       - name: Install OpenMP
-        run: "sudo apt install -y libomp-dev"
+        run: "sudo apt install -y libomp-dev opencl-c-headers"
 
       - name: Fetch sources
         uses: actions/checkout@v4

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 538076515..66af39632 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -98,12 +98,15 @@ get_property(GC_TOOLS GLOBAL PROPERTY GC_TOOLS)
 get_property(GC_MLIR_LIBS GLOBAL PROPERTY GC_MLIR_LIBS)
 get_property(GC_PASS_LIBS GLOBAL PROPERTY GC_PASS_LIBS)
 get_property(GC_DIALECT_LIBS GLOBAL PROPERTY GC_DIALECT_LIBS)
+get_property(IMEX_LIBS GLOBAL PROPERTY IMEX_LIBS)
+
 install(TARGETS
         GcInterface
         ${GC_TOOLS}
         ${GC_MLIR_LIBS}
         ${GC_PASS_LIBS}
         ${GC_DIALECT_LIBS}
+        ${IMEX_LIBS}
         EXPORT ${PROJECT_NAME}Targets
         ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
         LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}

diff --git a/cmake/imex.cmake b/cmake/imex.cmake
index c698945c4..f36f5c391 100644
--- a/cmake/imex.cmake
+++ b/cmake/imex.cmake
@@ -24,4 +24,5 @@ if (NOT DEFINED IMEX_INCLUDES)
             ${imex_SOURCE_DIR}/src
     )
     set_property(GLOBAL PROPERTY IMEX_INCLUDES ${IMEX_INCLUDES})
+    target_compile_options(GcInterface INTERFACE -DGC_USE_IMEX)
 endif ()

diff --git a/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h b/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h
index b01b9f2c6..c54f0d94e 100644
--- a/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h
+++ b/include/gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h
@@ -10,19 +10,470 @@
 #define GC_GPUOCLRUNTIME_H
 
 namespace mlir::gc::gpu {
-constexpr char GPU_OCL_MALLOC[] = "gcGpuOclMalloc";
+constexpr char GPU_OCL_MALLOC_DEV[] = "gcGpuOclMallocDev";
+constexpr char GPU_OCL_MALLOC_SHARED[] = "gcGpuOclMallocShared";
 constexpr char GPU_OCL_DEALLOC[] = "gcGpuOclDealloc";
 constexpr char GPU_OCL_MEMCPY[] = "gcGpuOclMemcpy";
 constexpr char GPU_OCL_KERNEL_CREATE[] = "gcGpuOclKernelCreate";
 constexpr char GPU_OCL_KERNEL_DESTROY[] = "gcGpuOclKernelDestroy";
 constexpr char GPU_OCL_KERNEL_LAUNCH[] = "gcGpuOclKernelLaunch";
+constexpr char GPU_OCL_FINISH[] = "gcGpuOclFinish";
 constexpr char GPU_OCL_MOD_DESTRUCTOR[] = "gcGpuOclModuleDestructor";
 } // namespace mlir::gc::gpu
 
 #ifndef GC_GPU_OCL_CONST_ONLY
+#include <cstdarg>
+#include <unordered_map>
+#include <unordered_set>
 
-// TBD
+#include <CL/cl_ext.h>
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/ExecutionEngine/ExecutionEngine.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Pass/PassManager.h"
+
+namespace mlir::gc::gpu {
+struct OclDevCtxPair {
+  cl_device_id device;
+  cl_context context;
+  explicit OclDevCtxPair(cl_device_id device, cl_context context)
+      : device(device), context(context) {}
+
+  bool operator==(const OclDevCtxPair &other) const {
+    return device == other.device && context == other.context;
+  }
+};
+} // namespace mlir::gc::gpu
+template <> struct std::hash<mlir::gc::gpu::OclDevCtxPair> {
+  std::size_t
+  operator()(const mlir::gc::gpu::OclDevCtxPair &pair) const noexcept {
+    return std::hash<cl_device_id>()(pair.device) ^
+           std::hash<cl_context>()(pair.context);
+  }
+}; // namespace std
+namespace mlir::gc::gpu {
+struct OclModule;
+struct OclContext;
+struct OclModuleBuilder;
+
+struct OclRuntime {
+  // Returns the available Intel GPU device ids.
+  [[nodiscard]] static llvm::Expected<SmallVector<cl_device_id>>
+  gcIntelDevices(size_t max = std::numeric_limits<size_t>::max());
+
+  [[nodiscard]] static llvm::Expected<OclRuntime> get();
+
+  [[nodiscard]] static llvm::Expected<OclRuntime> get(cl_device_id device);
+
+  [[nodiscard]] static llvm::Expected<OclRuntime> get(cl_command_queue queue);
+
+  [[nodiscard]] static llvm::Expected<OclRuntime> get(cl_device_id device,
+                                                      cl_context context);
+
+  static bool isOutOfOrder(cl_command_queue queue);
+
+  [[nodiscard]] cl_context getContext() const;
+
+  [[nodiscard]] cl_device_id getDevice() const;
+
+  [[nodiscard]] llvm::Expected<cl_command_queue>
+  createQueue(bool outOfOrder = false) const;
+
+  [[nodiscard]] static llvm::Expected<bool>
+  releaseQueue(cl_command_queue queue);
+
+  [[nodiscard]] llvm::Expected<void *> usmAllocDev(size_t size) const;
+
+  [[nodiscard]] llvm::Expected<void *> usmAllocShared(size_t size) const;
+
+  [[nodiscard]] llvm::Expected<bool> usmFree(const void *ptr) const;
+
+  [[nodiscard]] llvm::Expected<bool> usmCpy(OclContext &ctx, const void *src,
+                                            void *dst, size_t size) const;
+
+  template <typename T>
+  [[nodiscard]] llvm::Expected<T *> usmNewDev(size_t size) const {
+    auto expected = usmAllocDev(size * sizeof(T));
+    if (expected) {
+      return static_cast<T *>(*expected);
+    }
+    return expected.takeError();
+  }
+
+  template <typename T>
+  [[nodiscard]] llvm::Expected<T *> usmNewShared(size_t size) const {
+    auto expected = usmAllocShared(size * sizeof(T));
+    if (expected) {
+      return static_cast<T *>(*expected);
+    }
+    return expected.takeError();
+  }
+
+  template <typename T>
+  [[nodiscard]] llvm::Expected<bool> usmCpy(OclContext &ctx, const T *src,
+                                            T *dst, size_t size) const {
+    return usmCpy(ctx, static_cast<const void *>(src),
+                  static_cast<void *>(dst), size * sizeof(T));
+  }
+
+  // Use with caution! This reliably validates USM pointers, but may return a
+  // false positive for any other kind of pointer.
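+  // For example (an illustrative sketch, not part of the API contract; it
+  // assumes an Intel GPU device is present and uses only the methods
+  // declared in this struct plus gcGetOrReport from gc/Utils/Error.h):
+  //
+  //   OclRuntime rt = gcGetOrReport(OclRuntime::get());
+  //   cl_command_queue q = gcGetOrReport(rt.createQueue());
+  //   OclContext ctx{rt, q};
+  //   float *usm = gcGetOrReport(rt.usmNewDev<float>(1024));
+  //   assert(rt.isUsm(usm));  // guaranteed for USM allocations
+  //   // ... enqueue kernels/copies that use 'usm' ...
+  //   gcGetOrReport(ctx.finish());
+  //   gcGetOrReport(rt.usmFree(usm));
+  //   gcGetOrReport(OclRuntime::releaseQueue(q));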
+  bool isUsm(const void *ptr) const;
+
+  bool operator==(const OclRuntime &other) const {
+    return getDevice() == other.getDevice() &&
+           getContext() == other.getContext();
+  }
+
+private:
+  struct Ext;
+  struct Exports;
+  friend OclContext;
+  friend OclModuleBuilder;
+  template <unsigned N> friend struct DynamicExecutor;
+  template <unsigned N> friend struct StaticExecutor;
+  explicit OclRuntime(const Ext &ext);
+  const Ext &ext;
+
+#ifndef NDEBUG
+  static void debug(const char *file, int line, const char *msg);
+#endif
+};
+
+static constexpr int64_t ZERO = 0;
+static constexpr auto ZERO_PTR = const_cast<int64_t *>(&ZERO);
+
+// NOTE: The context is mutable and not thread-safe! It's expected to be used
+// in a single thread only.
+struct OclContext {
+  const OclRuntime &runtime;
+  const cl_command_queue queue;
+  // Preserves the execution order. This is required in case of out-of-order
+  // execution (CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE). When the execution
+  // is completed, the 'lastEvent' field contains the event of the last
+  // enqueued command. If 'preserveOrder' is false, 'waitList' is ignored.
+  const bool preserveOrder;
+  cl_uint waitListLen;
+  cl_event *waitList;
+  cl_event lastEvent;
+
+  explicit OclContext(const OclRuntime &runtime, cl_command_queue queue,
+                      cl_uint waitListLen = 0, cl_event *waitList = nullptr)
+      : OclContext(runtime, queue, OclRuntime::isOutOfOrder(queue),
+                   waitListLen, waitList) {}
+
+  explicit OclContext(const OclRuntime &runtime, cl_command_queue queue,
+                      bool preserveOrder, cl_uint waitListLen,
+                      cl_event *waitList)
+      : runtime(runtime), queue(queue), preserveOrder(preserveOrder),
+        waitListLen(preserveOrder ? waitListLen : 0),
+        waitList(preserveOrder ? waitList : nullptr), lastEvent(nullptr),
+        clPtrs(nullptr) {
+    assert(!OclRuntime::isOutOfOrder(queue) || preserveOrder);
+    assert(preserveOrder || (waitListLen == 0 && waitList == nullptr));
+  }
+
+  OclContext(const OclContext &) = delete;
+  OclContext &operator=(const OclContext &) = delete;
+
+  [[nodiscard]] llvm::Expected<bool> finish();
+
+private:
+  friend OclRuntime;
+  friend OclRuntime::Exports;
+  template <unsigned N> friend struct DynamicExecutor;
+  template <unsigned N> friend struct StaticExecutor;
+  std::unordered_set<void *> *clPtrs;
+
+  void setLastEvent(cl_event event) {
+    lastEvent = event;
+    if (event) {
+      waitListLen = 1;
+      waitList = &lastEvent;
+    } else {
+      waitListLen = 0;
+      waitList = nullptr;
+    }
+  }
+};
+
+struct OclModule {
+  const OclRuntime runtime;
+  // If all the function arguments have static shapes, then this field is true
+  // and main.staticMain is used. Otherwise, main.wrappedMain is used.
+  const bool isStatic;
+
+  ~OclModule();
+  OclModule(const OclModule &) = delete;
+  OclModule &operator=(const OclModule &) = delete;
+  OclModule(const OclModule &&) = delete;
+  OclModule &operator=(const OclModule &&) = delete;
+
+  void dumpToObjectFile(StringRef filename) const {
+    engine->dumpToObjectFile(filename);
+  }
+
+private:
+  friend OclModuleBuilder;
+  template <unsigned N> friend struct DynamicExecutor;
+  template <unsigned N> friend struct OclModuleExecutorBase;
+  template <unsigned N> friend struct StaticExecutor;
+  // This function is only created when all args are memrefs with static shape.
+  using StaticMainFunc = void (*)(OclContext *, void **);
+  // Wrapper, generated by the engine. The arguments are pointers to the
+  // values.
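+  // For example, for 'func @main(%arg0: memref<2x3xf32>)' the packed array
+  // would contain, per the MLIR C-compatible wrapper convention
+  // (an illustrative layout): pointers to allocatedPtr, alignedPtr, offset,
+  // shape[0], shape[1], strides[0], strides[1], followed by the trailing
+  // context arguments appended by the AddContextArg pass.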
+  using WrappedMainFunc = void (*)(void **);
+  union MainFunc {
+    StaticMainFunc staticMain;
+    WrappedMainFunc wrappedMain;
+  };
+  const MainFunc main;
+  const ArrayRef<Type> argTypes;
+  std::unique_ptr<ExecutionEngine> engine;
+
+  explicit OclModule(const OclRuntime &runtime, const bool isStatic,
+                     const MainFunc main, const ArrayRef<Type> argTypes,
+                     std::unique_ptr<ExecutionEngine> engine)
+      : runtime(runtime), isStatic(isStatic), main(main), argTypes(argTypes),
+        engine(std::move(engine)) {}
+};
+
+struct OclModuleBuilderOpts {
+  StringRef funcName = {};
+  bool enableObjectDump = false;
+  ArrayRef<StringRef> sharedLibPaths = {};
+  void (*pipeline)(OpPassManager &) = nullptr;
+};
+
+struct OclModuleBuilder {
+  friend OclRuntime;
+  explicit OclModuleBuilder(ModuleOp module,
+                            const OclModuleBuilderOpts &opts = {});
+  explicit OclModuleBuilder(OwningOpRef<ModuleOp> &module,
+                            const OclModuleBuilderOpts &opts = {})
+      : OclModuleBuilder(module.release(), opts) {}
+  explicit OclModuleBuilder(OwningOpRef<ModuleOp> &&module,
+                            const OclModuleBuilderOpts &opts = {})
+      : OclModuleBuilder(module.release(), opts) {}
+
+  llvm::Expected<std::shared_ptr<const OclModule>>
+  build(const OclRuntime &runtime);
+
+  llvm::Expected<std::shared_ptr<const OclModule>>
+  build(cl_command_queue queue);
+
+  llvm::Expected<std::shared_ptr<const OclModule>>
+  build(cl_device_id device, cl_context context);
+
+private:
+  ModuleOp mlirModule;
+  const bool enableObjectDump;
+  const ArrayRef<StringRef> sharedLibPaths;
+  void (*const pipeline)(OpPassManager &);
+  const StringRef funcName;
+  const ArrayRef<Type> argTypes;
+  std::shared_mutex mux;
+  std::unordered_map<OclDevCtxPair, std::shared_ptr<const OclModule>> cache;
+
+  llvm::Expected<std::shared_ptr<const OclModule>>
+  build(const OclRuntime::Ext &ext);
+};
+
+// NOTE: This class is mutable and not thread-safe!
+template <unsigned N> struct OclModuleExecutorBase {
+
+  void reset() {
+    args.clear();
+    clPtrs.clear();
+    argCounter = 0;
+  }
+
+  Type getArgType(unsigned idx) const { return mod->argTypes[idx]; }
+
+  [[nodiscard]] bool isSmall() const { return args.small(); }
+
+protected:
+  struct Args : SmallVector<void *, N> {
+    [[nodiscard]] bool small() const { return this->isSmall(); }
+  };
+
+  const std::shared_ptr<const OclModule> &mod;
+  // Contains the pointers of all non-USM arguments. It's expected that the
+  // arguments are either USM or CL pointers and, most probably, are USM;
+  // thus, in most cases, this set will be empty.
+  std::unordered_set<void *> clPtrs;
+  Args args;
+  unsigned argCounter = 0;
+
+  explicit OclModuleExecutorBase(std::shared_ptr<const OclModule> &mod)
+      : mod(mod) {}
+
+#ifndef NDEBUG
+  void checkExec(const OclContext &ctx) const {
+    auto rt = OclRuntime::get(ctx.queue);
+    assert(rt);
+    assert(*rt == mod->runtime);
+    assert(argCounter == mod->argTypes.size());
+  }
+
+  void checkArg(const void *alignedPtr, bool isUsm = true) const {
+    assert(!isUsm || mod->runtime.isUsm(alignedPtr));
+  }
+#endif
+};
+
+// NOTE: This executor can only be used if mod->isStatic == true!
+template <unsigned N> struct StaticExecutor : OclModuleExecutorBase<N> {
+  explicit StaticExecutor(std::shared_ptr<const OclModule> &mod)
+      : OclModuleExecutorBase<N>(mod) {
+    assert(this->mod->isStatic);
+  }
+
+  void exec(OclContext &ctx) {
+#ifndef NDEBUG
+    this->checkExec(ctx);
+#endif
+    ctx.clPtrs = &this->clPtrs;
+    this->mod->main.staticMain(&ctx, this->args.data());
+  }
+
+  void arg(void *alignedPtr, bool isUsm = true) {
+#ifndef NDEBUG
+    this->checkArg(alignedPtr, isUsm);
+    std::ostringstream oss;
+    oss << "Arg" << this->argCounter << ": alignedPtr=" << alignedPtr
+        << ", isUsm=" << (isUsm ? "true" : "false");
+    OclRuntime::debug(__FILE__, __LINE__, oss.str().c_str());
+#endif
+    ++this->argCounter;
+    this->args.emplace_back(alignedPtr);
+    if (!isUsm) {
+      this->clPtrs.insert(alignedPtr);
+    }
+  }
+
+  template <typename T> void arg(T *alignedPtr, bool isUsm = true) {
+    arg(reinterpret_cast<void *>(alignedPtr), isUsm);
+  }
+
+  void operator()(OclContext &ctx) { exec(ctx); }
+
+  template <typename T> void operator()(OclContext &ctx, T *ptr1, ...) {
+    {
+      this->reset();
+      arg(reinterpret_cast<void *>(ptr1));
+      va_list args;
+      va_start(args, ptr1);
+      for (unsigned i = 0, n = this->mod->argTypes.size() - 1; i < n; i++) {
+        arg(va_arg(args, void *));
+      }
+      va_end(args);
+      exec(ctx);
+    }
+  }
+};
+
+// The main function arguments are added in the following format -
+// https://mlir.llvm.org/docs/TargetLLVMIR/#c-compatible-wrapper-emission.
+// NOTE: This executor can only be used if mod->isStatic != true!
+template <unsigned N> struct DynamicExecutor : OclModuleExecutorBase<N> {
+  explicit DynamicExecutor(std::shared_ptr<const OclModule> &mod)
+      : OclModuleExecutorBase<N>(mod) {
+    assert(!this->mod->isStatic);
+  }
+
+  void exec(OclContext &ctx) {
+#ifndef NDEBUG
+    this->checkExec(ctx);
+#endif
+    auto size = this->args.size();
+    auto ctxPtr = &ctx;
+    this->args.emplace_back(&ctxPtr);
+    this->args.emplace_back(&ctxPtr);
+    this->args.emplace_back(ZERO_PTR);
+    this->mod->main.wrappedMain(this->args.data());
+    this->args.truncate(size);
+  }
+
+  void arg(void *&alignedPtr, size_t rank, const int64_t *shape,
+           const int64_t *strides, bool isUsm = true) {
+    arg(alignedPtr, alignedPtr, ZERO, rank, shape, strides, isUsm);
+  }
+
+  // NOTE: The argument values are not copied, only the pointers are stored!
+  void arg(void *&allocatedPtr, void *&alignedPtr, const int64_t &offset,
+           size_t rank, const int64_t *shape, const int64_t *strides,
+           bool isUsm = true) {
+#ifndef NDEBUG
+    this->checkArg(alignedPtr, isUsm);
+    if (auto type = llvm::dyn_cast<MemRefType>(
+            this->getArgType(this->argCounter))) {
+      if (type.hasStaticShape()) {
+        auto size = type.getShape();
+        assert(rank == size.size());
+        for (size_t i = 0; i < rank; i++) {
+          assert(shape[i] == size[i]);
+        }
+
+        SmallVector<int64_t> expectedStrides;
+        if (int64_t expectedOffset; !failed(
+                getStridesAndOffset(type, expectedStrides, expectedOffset))) {
+          assert(expectedOffset == offset);
+          for (size_t i = 0; i < rank; i++) {
+            assert(expectedStrides[i] == strides[i]);
+          }
+        }
+      }
+    }
+
+    std::ostringstream oss;
+    oss << "Arg" << this->argCounter << ": ptr=" << allocatedPtr
+        << ", alignedPtr=" << alignedPtr
+        << ", isUsm=" << (isUsm ? "true" : "false") << ", offset=" << offset
+        << ", shape=[";
+    for (unsigned i = 0; i < rank; i++) {
+      oss << shape[i] << (i + 1 < rank ? ", " : "]");
+    }
+    oss << ", strides=[";
+    for (unsigned i = 0; i < rank; i++) {
+      oss << strides[i] << (i + 1 < rank ? ", " : "]");
+    }
+    OclRuntime::debug(__FILE__, __LINE__, oss.str().c_str());
+#endif
+
+    ++this->argCounter;
+    this->args.emplace_back(&allocatedPtr);
+    this->args.emplace_back(&alignedPtr);
+    this->args.emplace_back(const_cast<int64_t *>(&offset));
+    for (size_t i = 0; i < rank; i++) {
+      this->args.emplace_back(const_cast<int64_t *>(&shape[i]));
+    }
+    for (size_t i = 0; i < rank; i++) {
+      this->args.emplace_back(const_cast<int64_t *>(&strides[i]));
+    }
+    if (!isUsm) {
+      this->clPtrs.insert(alignedPtr);
+    }
+  }
+
+  template <typename T>
+  void arg(T *&alignedPtr, size_t rank, const int64_t *shape,
+           const int64_t *strides, bool isUsm = true) {
+    arg(reinterpret_cast<void *&>(alignedPtr), rank, shape, strides, isUsm);
+  }
+
+  template <typename T>
+  void arg(T *&allocatedPtr, T *&alignedPtr, const int64_t &offset,
+           size_t rank, const int64_t *shape, const int64_t *strides,
+           bool isUsm = true) {
+    arg(reinterpret_cast<void *&>(allocatedPtr),
+        reinterpret_cast<void *&>(alignedPtr), offset, rank, shape, strides,
+        isUsm);
+  }
+
+  void operator()(OclContext &ctx) { exec(ctx); }
+};
+} // namespace mlir::gc::gpu
 #else
 #undef GC_GPU_OCL_CONST_ONLY
 #endif

diff --git a/include/gc/Transforms/Passes.h b/include/gc/Transforms/Passes.h
index 558bd9b16..c468f22b3 100644
--- a/include/gc/Transforms/Passes.h
+++ b/include/gc/Transforms/Passes.h
@@ -10,6 +10,7 @@
 #define GC_PASSES_H
 
 #include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassOptions.h"
 
 namespace mlir {
 class OpBuilder;
@@ -115,7 +116,18 @@ void populateFrontendPasses(mlir::OpPassManager &);
 void populateCPUPipeline(mlir::OpPassManager &);
 
 #ifdef GC_USE_IMEX
-struct GPUPipelineOption;
+struct GPUPipelineOption : PassPipelineOptions<GPUPipelineOption> {
+  Option<bool> isUsmArgs{
+      *this, "is-usm-args",
+      llvm::cl::desc("Whether to use USM (unified shared memory) func args, "
+                     "in which case the host and device can access the same "
+                     "buffer and there is no need to add memcpy explicitly."),
+      llvm::cl::init(true)};
+  Option<bool> callFinish{
+      *this, "call-finish",
+      llvm::cl::desc("Call finish() after each GPU kernel launch."),
+      llvm::cl::init(false)};
+};
 void populateGPUPipeline(mlir::OpPassManager &, const GPUPipelineOption &);
 #endif

diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td
index 2ddf0a06e..fb5581bb5 100644
--- a/include/gc/Transforms/Passes.td
+++ b/include/gc/Transforms/Passes.td
@@ -106,6 +106,11 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
   let description = [{
     Convert the gpu alloc, dealloc, memcpy and launch operations to GpuOclRuntime calls.
   }];
+  let options = [
+    Option<"callFinish", "call-finish", "bool",
+           /*default=*/"false",
+           "Call finish() after each kernel launch.">
+  ];
 }
 
 #endif // GC_USE_IMEX

diff --git a/include/gc/Utils/Error.h b/include/gc/Utils/Error.h
new file mode 100644
index 000000000..6694e14fe
--- /dev/null
+++ b/include/gc/Utils/Error.h
@@ -0,0 +1,58 @@
+//===-- Error.h - Error processing functions --------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef GC_ERROR_H
+#define GC_ERROR_H
+
+#include <sstream>
+
+#include "gc/Utils/Log.h"
+
+#include "llvm/Support/Error.h"
+
+namespace mlir::gc::err {
+#ifdef NDEBUG
+#define GC_ERR_LOC_DECL
+#define GC_ERR_LOC_ARGS
+#define GC_ERR_LOC
+#else
+#define GC_ERR_LOC_DECL const char *fileName, int lineNum,
+#define GC_ERR_LOC_ARGS fileName, lineNum,
+#define GC_ERR_LOC __FILE__, __LINE__,
+#endif
+
+#define gcMakeErr(...) mlir::gc::err::makeLlvmError(GC_ERR_LOC __VA_ARGS__)
+#define gcReportErr(...)                                                       \
+  mlir::gc::err::report(GC_ERR_LOC gcMakeErr(__VA_ARGS__))
+#define gcGetOrReport(expected) mlir::gc::err::getOrReport(GC_ERR_LOC expected)
+
+template <typename... Args>
+[[nodiscard]] llvm::Error makeLlvmError(GC_ERR_LOC_DECL Args... args) {
+  log::insetLog(GC_ERR_LOC_ARGS std::cerr, "ERROR", args...);
+  std::ostringstream oss;
+  log::insertArgs(oss, args...);
+  auto msg = oss.str();
+  return llvm::make_error<llvm::StringError>(msg.substr(0, msg.length() - 1),
+                                             llvm::inconvertibleErrorCode());
+}
+
+[[noreturn]] static void report(GC_ERR_LOC_DECL llvm::Error err) {
+  log::insetLog(GC_ERR_LOC_ARGS std::cerr, "ERROR", "Unrecoverable error!");
+  report_fatal_error(std::move(err));
+}
+
+template <typename T>
+T getOrReport(GC_ERR_LOC_DECL llvm::Expected<T> expected) {
+  if (expected) {
+    return *expected;
+  }
+  report(GC_ERR_LOC_ARGS expected.takeError());
+}
+} // namespace mlir::gc::err
+
+#endif

diff --git a/include/gc/Utils/Log.h b/include/gc/Utils/Log.h
new file mode 100644
index 000000000..dc7bfc6f8
--- /dev/null
+++ b/include/gc/Utils/Log.h
@@ -0,0 +1,51 @@
+//===-- Log.h - Logging functions -------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef GC_LOG_H
+#define GC_LOG_H
+
+#include <iostream>
+
+namespace mlir::gc::log {
+static void insertArgs(std::ostream &stream) { stream << std::endl; }
+
+template <typename T, typename... Args>
+static void insertArgs(std::ostream &stream, T first, Args... args) {
+  stream << first;
+  insertArgs(stream, args...);
+}
+
+template <typename... Args>
+static void insetLog(
+#ifndef NDEBUG
+    const char *fileName, int lineNum,
+#endif
+    std::ostream &stream, const char *pref, Args... args) {
+  stream << "[" << pref << "] ";
+#ifndef NDEBUG
+  stream << "[" << fileName << ":" << lineNum << "] ";
+#endif
+  insertArgs(stream, args...);
+}
+
+#ifdef NDEBUG
+#define gcLogD(...)
+#define gcLogE(...) mlir::gc::log::insetLog(std::cerr, "ERROR", __VA_ARGS__)
+#else
+#define _insetLog(stream, pref, ...)                                           \
+  mlir::gc::log::insetLog(__FILE__, __LINE__, stream, pref, __VA_ARGS__)
+#define gcLogD(...) _insetLog(std::cout, "DEBUG", __VA_ARGS__)
+#define gcLogE(...) _insetLog(std::cerr, "ERROR", __VA_ARGS__)
+#endif
+} // namespace mlir::gc::log
+
+#ifdef GC_LOG_NO_DEBUG
+#undef gcLogD
+#define gcLogD(...)
+#endif
+
+#endif

diff --git a/lib/gc/ExecutionEngine/CMakeLists.txt b/lib/gc/ExecutionEngine/CMakeLists.txt
index f13a27b1a..0985f1536 100644
--- a/lib/gc/ExecutionEngine/CMakeLists.txt
+++ b/lib/gc/ExecutionEngine/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_subdirectory(CPURuntime)
 add_subdirectory(Driver)
 if(GC_ENABLE_IMEX)
+  add_subdirectory(GPURuntime)
   add_subdirectory(OpenCLRuntime)
 endif()
\ No newline at end of file

diff --git a/lib/gc/ExecutionEngine/GPURuntime/CMakeLists.txt b/lib/gc/ExecutionEngine/GPURuntime/CMakeLists.txt
new file mode 100644
index 000000000..8aeab1db4
--- /dev/null
+++ b/lib/gc/ExecutionEngine/GPURuntime/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(ocl)

diff --git a/lib/gc/ExecutionEngine/GPURuntime/ocl/CMakeLists.txt b/lib/gc/ExecutionEngine/GPURuntime/ocl/CMakeLists.txt
new file mode 100644
index 000000000..37cd56b85
--- /dev/null
+++ b/lib/gc/ExecutionEngine/GPURuntime/ocl/CMakeLists.txt
@@ -0,0 +1,10 @@
+find_package(OpenCL REQUIRED)
+
+gc_add_mlir_library(GcGpuOclRuntime
+  GpuOclRuntime.cpp
+  LINK_LIBS PUBLIC
+  GcInterface
+  GcGpuPasses
+)
+target_include_directories(GcGpuOclRuntime PUBLIC ${OpenCL_INCLUDE_DIRS})
+target_link_libraries(GcGpuOclRuntime PUBLIC ${OpenCL_LIBRARIES})

diff --git a/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp b/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp
new file mode 100644
index 000000000..90a6b5cf5
--- /dev/null
+++ b/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp
@@ -0,0 +1,838 @@
+//===-- GpuOclRuntime.cpp - GPU OpenCL Runtime ------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <shared_mutex>
+
+// Comment out the following line to enable debug logging
+// #define GC_LOG_NO_DEBUG
+#include "gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h"
+#include "gc/Transforms/Passes.h"
+#include "gc/Utils/Error.h"
+#include "gc/Utils/Log.h"
+
+#include "llvm/ExecutionEngine/Orc/LLJIT.h"
+#include "llvm/Support/Error.h"
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Pass/PassManager.h"
+
+namespace mlir::gc::gpu {
+
+#define makeClErrPref(code) "OpenCL error ", code, ": "
+#define makeClErr(code, ...) gcMakeErr(makeClErrPref(code), __VA_ARGS__)
+#define reportClErr(code, ...) gcReportErr(makeClErrPref(code), __VA_ARGS__)
+
+#define CHECK(cond, ...)                                                       \
+  do {                                                                         \
+    if (!(cond))                                                               \
+      return gcMakeErr(__VA_ARGS__);                                           \
+  } while (0)
+#define CHECKE(expected, ...)                                                  \
+  do {                                                                         \
+    if (!expected) {                                                           \
+      gcLogE(__VA_ARGS__);                                                     \
+      return expected.takeError();                                             \
+    }                                                                          \
+  } while (0)
+#define CL_CHECK(expr, ...)                                                    \
+  do {                                                                         \
+    if (auto _cl_check_err = (expr); _cl_check_err != CL_SUCCESS)              \
+      return makeClErr(_cl_check_err, __VA_ARGS__);                            \
+  } while (0)
+#define CL_CHECKR(expr, ...)                                                   \
+  do {                                                                         \
+    if (auto _cl_check_err = (expr); _cl_check_err != CL_SUCCESS) {            \
+      reportClErr(_cl_check_err, __VA_ARGS__);                                 \
+    }                                                                          \
+  } while (0)
+
+// cl_ext function pointers
+struct OclRuntime::Ext : OclDevCtxPair {
+  clDeviceMemAllocINTEL_fn clDeviceMemAllocINTEL;
+  clSharedMemAllocINTEL_fn clSharedMemAllocINTEL;
+  clMemFreeINTEL_fn clMemFreeINTEL;
+  clEnqueueMemcpyINTEL_fn clEnqueueMemcpyINTEL;
+  clGetMemAllocInfoINTEL_fn clGetMemAllocInfoINTEL;
+  clSetKernelArgMemPointerINTEL_fn clSetKernelArgMemPointerINTEL;
+
+  explicit Ext(cl_device_id device, cl_context context,
+               clDeviceMemAllocINTEL_fn clDeviceMemAllocINTEL,
+               clSharedMemAllocINTEL_fn clSharedMemAllocINTEL,
+               clMemFreeINTEL_fn clMemFreeINTEL,
+               clEnqueueMemcpyINTEL_fn clEnqueueMemcpyINTEL,
+               clGetMemAllocInfoINTEL_fn clGetMemAllocInfoINTEL,
+               clSetKernelArgMemPointerINTEL_fn clSetKernelArgMemPointerINTEL)
+      : OclDevCtxPair(device, context),
+        clDeviceMemAllocINTEL(clDeviceMemAllocINTEL),
+        clSharedMemAllocINTEL(clSharedMemAllocINTEL),
+        clMemFreeINTEL(clMemFreeINTEL),
+        clEnqueueMemcpyINTEL(clEnqueueMemcpyINTEL),
+        clGetMemAllocInfoINTEL(clGetMemAllocInfoINTEL),
+        clSetKernelArgMemPointerINTEL(clSetKernelArgMemPointerINTEL) {}
+
+  static llvm::Expected<const Ext *> get(cl_device_id device,
+                                         cl_context context) {
+    static std::shared_mutex mux;
+    static std::unordered_map<OclDevCtxPair, const Ext *> cache;
+
+    OclDevCtxPair pair{device, context};
+    {
+      std::shared_lock lock(mux);
+      if (auto it = cache.find(pair); it != cache.end()) {
+        return it->second;
+      }
+    }
+
+    cl_platform_id platform;
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_PLATFORM,
+                             sizeof(cl_platform_id), &platform, nullptr),
+             "Failed to get the device platform.");
+
+#define FIND_FUNC(name)                                                        \
+  auto name = reinterpret_cast<name##_fn>(                                     \
+      clGetExtensionFunctionAddressForPlatform(platform, #name));              \
+  CHECK(name, "Failed to get the " #name " function address.")
+
+    FIND_FUNC(clDeviceMemAllocINTEL);
+    FIND_FUNC(clSharedMemAllocINTEL);
+    FIND_FUNC(clMemFreeINTEL);
+    FIND_FUNC(clEnqueueMemcpyINTEL);
+    FIND_FUNC(clGetMemAllocInfoINTEL);
+    FIND_FUNC(clSetKernelArgMemPointerINTEL);
+
+    std::lock_guard lock(mux);
+    if (auto it = cache.find(pair); it != cache.end()) {
+      return it->second;
+    }
+    auto ext =
+        new Ext(device, context, clDeviceMemAllocINTEL, clSharedMemAllocINTEL,
+                clMemFreeINTEL, clEnqueueMemcpyINTEL, clGetMemAllocInfoINTEL,
+                clSetKernelArgMemPointerINTEL);
+    cache.emplace(pair, ext);
+    return ext;
+  }
+};
+
+struct Kernel {
+  cl_program program;
+  cl_kernel kernel;
+  const size_t globalSize[3];
+  const size_t localSize[3];
+  const SmallVector<size_t> argSize;
+
+  explicit Kernel(cl_program program, cl_kernel kernel, const size_t *gridSize,
+                  const size_t *blockSize, size_t argNum,
+                  const size_t *argSize)
+      : program(program),
+        kernel(kernel), globalSize{gridSize[0] * blockSize[0],
+                                   gridSize[1] * blockSize[1],
+                                   gridSize[2] * blockSize[2]},
+        localSize{blockSize[0], blockSize[1], blockSize[2]},
+        argSize(argSize, argSize + argNum) {
+#ifndef NDEBUG
+    std::string args;
+    for (size_t i = 0; i < argNum; i++) {
+      args += std::to_string(argSize[i]);
+      if (i < argNum - 1) {
+        args += ", ";
+      }
+    }
+    gcLogD("Kernel ", kernel, " params: globalSize=[", globalSize[0], ", ",
+           globalSize[1], ", ", globalSize[2], "], localSize=[", localSize[0],
+           ", ", localSize[1], ", ", localSize[2], "], argSize=[",
+           args.c_str(), "]");
+#endif
+  }
+
+  ~Kernel() {
+    CL_CHECKR(clReleaseKernel(kernel), "Failed to release OpenCL kernel.");
+    gcLogD("Released OpenCL kernel: ", kernel);
+    CL_CHECKR(clReleaseProgram(program), "Failed to release OpenCL program.");
+    gcLogD("Released OpenCL program: ", program);
+  }
+};
+
+// Functions exported to the ExecutionEngine
+struct OclRuntime::Exports {
+  static llvm::orc::SymbolMap
+  symbolMap(llvm::orc::MangleAndInterner interner) {
+    llvm::orc::SymbolMap map;
+    map.reserve(8);
+    map.try_emplace(interner(GPU_OCL_MALLOC_DEV),
+                    llvm::orc::ExecutorAddr::fromPtr(&allocDev),
+                    llvm::JITSymbolFlags::Exported);
+    map.try_emplace(interner(GPU_OCL_MALLOC_SHARED),
+                    llvm::orc::ExecutorAddr::fromPtr(&allocShared),
+                    llvm::JITSymbolFlags::Exported);
+    map.try_emplace(interner(GPU_OCL_DEALLOC),
+                    llvm::orc::ExecutorAddr::fromPtr(&dealloc),
+                    llvm::JITSymbolFlags::Exported);
+    map.try_emplace(interner(GPU_OCL_MEMCPY),
+                    llvm::orc::ExecutorAddr::fromPtr(&memcpy),
+                    llvm::JITSymbolFlags::Exported);
+    map.try_emplace(interner(GPU_OCL_KERNEL_CREATE),
+                    llvm::orc::ExecutorAddr::fromPtr(&kernelCreate),
+                    llvm::JITSymbolFlags::Exported);
+    map.try_emplace(interner(GPU_OCL_KERNEL_DESTROY),
+                    llvm::orc::ExecutorAddr::fromPtr(&kernelDestroy),
+                    llvm::JITSymbolFlags::Exported);
+    map.try_emplace(interner(GPU_OCL_KERNEL_LAUNCH),
+                    llvm::orc::ExecutorAddr::fromPtr(&kernelLaunch),
+                    llvm::JITSymbolFlags::Exported);
+    map.try_emplace(interner(GPU_OCL_FINISH),
+                    llvm::orc::ExecutorAddr::fromPtr(&finish),
+                    llvm::JITSymbolFlags::Exported);
+    return map;
+  }
+
+private:
+  static void *allocDev(const OclContext *ctx, size_t size) {
+    return gcGetOrReport(ctx->runtime.usmAllocDev(size));
+  }
+
+  static void *allocShared(const OclContext *ctx, size_t size) {
+    return gcGetOrReport(ctx->runtime.usmAllocShared(size));
+  }
+
+  static void dealloc(const OclContext *ctx, const void *ptr) {
+    gcGetOrReport(ctx->runtime.usmFree(ptr));
+  }
+
+  static void memcpy(OclContext *ctx, const void *src, void *dst,
+                     size_t size) {
+    gcGetOrReport(ctx->runtime.usmCpy(*ctx, src, dst, size));
+  }
+
+  static Kernel *kernelCreate(const OclContext *ctx, size_t spirvLen,
+                              const unsigned char *spirv, const char *name,
+                              const size_t *gridSize, const size_t *blockSize,
+                              size_t argNum, const size_t *argSize) {
+    cl_int err;
+    auto program =
+        clCreateProgramWithIL(ctx->runtime.ext.context, spirv, spirvLen, &err);
+    CL_CHECKR(err, "Failed to create OpenCL program with IL.");
+
+    gcLogD("Created new OpenCL program: ", program);
+    err = clBuildProgram(program, 1, &ctx->runtime.ext.device, nullptr,
+                         nullptr, nullptr);
+    CL_CHECKR(err, "Failed to build the program: ", program);
+    gcLogD("The program has been built: ", program);
+
+    auto kernel = clCreateKernel(program, name, &err);
+    CL_CHECKR(err, "Failed to create OpenCL kernel from program: ", program);
+    gcLogD("Created new OpenCL kernel ", kernel, " from program ", program);
+
+    cl_bool enable = CL_TRUE;
+    err = clSetKernelExecInfo(kernel,
+                              CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL,
+                              sizeof(enable), &enable);
+    CL_CHECKR(err, "Failed to set indirect host access.");
+    err = clSetKernelExecInfo(kernel,
+                              CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL,
+                              sizeof(enable), &enable);
+    CL_CHECKR(err, "Failed to set indirect device access.");
+    err = clSetKernelExecInfo(kernel,
+                              CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL,
+                              sizeof(enable), &enable);
+    CL_CHECKR(err, "Failed to set indirect shared access.");
+
+    return new Kernel(program, kernel, gridSize, blockSize, argNum, argSize);
+  }
+
+  static void kernelDestroy(size_t count, Kernel **kernels) {
+    gcLogD("Destroying kernels.");
+    for (size_t i = 0; i < count; i++) {
+      if (kernels[i]) {
+        delete kernels[i];
+      }
+    }
+  }
+
+  static void kernelLaunch(OclContext *ctx, Kernel *kernel, ...) {
+    struct ClonedKernel {
+      cl_kernel kernel;
+
+      explicit ClonedKernel(cl_kernel kernel) : kernel(kernel) {}
+
+      ~ClonedKernel() {
+        gcLogD("Releasing cloned OpenCL kernel: ", kernel);
+        CL_CHECKR(clReleaseKernel(kernel),
+                  "Failed to release the kernel: ", kernel);
+      }
+    };
+
+    gcLogD("Launching kernel: ", kernel->kernel);
+
+    cl_int err;
+    ClonedKernel cloned{clCloneKernel(kernel->kernel, &err)};
+    CL_CHECKR(err, "Failed to clone OpenCL kernel: ", kernel->kernel);
+    gcLogD("Cloned OpenCL kernel ", kernel->kernel, ": ", cloned.kernel);
+
+    va_list args;
+    va_start(args, kernel);
+    for (size_t i = 0, n = kernel->argSize.size(); i < n; i++) {
+      auto size = kernel->argSize[i];
+      void *ptr = va_arg(args, void *);
+
+      if (size) {
+        gcLogD("Setting kernel ", cloned.kernel, " argument ", i, " to ",
+               *static_cast<size_t *>(ptr));
+        err = clSetKernelArg(cloned.kernel, i, size, ptr);
+      } else if (ctx->clPtrs->find(ptr) == ctx->clPtrs->end()) {
+        gcLogD("Setting kernel ", cloned.kernel, " argument ", i,
+               " to USM pointer ", ptr);
+        err = ctx->runtime.ext.clSetKernelArgMemPointerINTEL(cloned.kernel, i,
+                                                             ptr);
+      } else {
+        gcLogD("Setting kernel ", cloned.kernel, " argument ", i,
+               " to CL pointer ", ptr);
+        err = clSetKernelArg(cloned.kernel, i, sizeof(cl_mem), &ptr);
+      }
+
+      CL_CHECKR(err, "Failed to set kernel ", cloned.kernel, " argument ", i,
+                " of size ", size);
+    }
+    va_end(args);
+
+    if (ctx->preserveOrder) {
+      cl_event event = nullptr;
+      err = clEnqueueNDRangeKernel(ctx->queue, cloned.kernel, 3, nullptr,
+                                   kernel->globalSize, kernel->localSize,
+                                   ctx->waitListLen, ctx->waitList, &event);
+      ctx->setLastEvent(event);
+    } else {
+      err = clEnqueueNDRangeKernel(ctx->queue, cloned.kernel, 3, nullptr,
+                                   kernel->globalSize, kernel->localSize, 0,
+                                   nullptr, nullptr);
+    }
+
+    CL_CHECKR(err, "Failed to enqueue kernel execution: ", cloned.kernel);
+    gcLogD("Enqueued kernel execution: ", cloned.kernel);
+  }
+
+  static void finish(OclContext *ctx) { gcGetOrReport(ctx->finish()); }
+};
+
+OclRuntime::OclRuntime(const Ext &ext) : ext(ext) {}
+
+llvm::Expected<SmallVector<cl_device_id>>
+OclRuntime::gcIntelDevices(size_t max) {
+  SmallVector<cl_device_id> intelDevices;
+  if (max == 0) {
+    return intelDevices;
+  }
+
+  cl_uint numPlatforms;
+  CL_CHECK(clGetPlatformIDs(0, nullptr, &numPlatforms),
+           "Failed to get the number of platforms.");
+
+  if (numPlatforms == 0) {
+    gcLogD("No platforms found.");
+    return intelDevices;
+  }
+
+  SmallVector<cl_platform_id> platforms(numPlatforms);
+  auto err = clGetPlatformIDs(numPlatforms, platforms.data(), nullptr);
+  if (err != CL_SUCCESS) {
+    gcLogE("Failed to get the platform ids. Error: ", err);
+    return intelDevices;
+  }
+
+  for (auto platform : platforms) {
+    cl_uint numDevices;
+    err =
+        clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, nullptr, &numDevices);
+    if (err != CL_SUCCESS) {
+      gcLogE("Failed to get the number of devices on the platform ", platform,
+             ". Error: ", err);
+      continue;
+    }
+    if (numDevices == 0) {
+      continue;
+    }
+
+    SmallVector<cl_device_id> devices(numDevices);
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices,
+                         devices.data(), nullptr);
+    if (err != CL_SUCCESS) {
+      gcLogE("Failed to get the device ids on the platform ", platform,
+             ". Error: ", err);
+      continue;
+    }
+
+    for (auto dev : devices) {
+      cl_uint vendorId;
+      err = clGetDeviceInfo(dev, CL_DEVICE_VENDOR_ID, sizeof(cl_uint),
+                            &vendorId, nullptr);
+      if (err != CL_SUCCESS) {
+        gcLogE("Failed to get info about the device ", dev, ". Error: ", err);
+        continue;
+      }
+      if (vendorId == 0x8086) {
+        intelDevices.emplace_back(dev);
+#ifndef NDEBUG
+        size_t nameSize;
+        std::string name;
+        clGetDeviceInfo(dev, CL_DEVICE_NAME, 0, nullptr, &nameSize);
+        name.resize(nameSize);
+        clGetDeviceInfo(dev, CL_DEVICE_NAME, nameSize, &name[0], nullptr);
+        gcLogD("[ INFO ] GPU device ", name.c_str(), " id: ", dev);
+#endif
+        if (intelDevices.size() == max) {
+          return intelDevices;
+        }
+      }
+    }
+  }
+
+  return intelDevices;
+}
+
+llvm::Expected<OclRuntime> OclRuntime::get() {
+  static OclRuntime *defaultRuntimePtr = nullptr;
+  if (OclRuntime *rt = defaultRuntimePtr) {
+    return *rt;
+  }
+
+  auto devices = gcIntelDevices(1);
+  CHECKE(devices, "Failed to get Intel GPU devices.");
+  if (devices->empty()) {
+    return gcMakeErr("No Intel GPU devices found.");
+  }
+
+  auto rt = get(devices.get()[0]);
+  CHECKE(rt, "Failed to create OclRuntime.");
+
+  static OclRuntime defaultRuntime = rt.get();
+  defaultRuntimePtr = &defaultRuntime;
+  return defaultRuntime;
+}
+
+llvm::Expected<OclRuntime> OclRuntime::get(cl_device_id device) {
+  static std::shared_mutex mux;
+  static std::unordered_map<cl_device_id, cl_context> cache;
+  cl_context context = nullptr;
+
+  {
+    std::shared_lock lock(mux);
+    if (auto it = cache.find(device); it != cache.end()) {
+      context = it->second;
+    }
+  }
+
+  if (context) {
+    return get(device, context);
+  }
+
+  cl_int err;
+  context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
+  CL_CHECK(err, "Failed to create OpenCL context.");
+  gcLogD("Created new OpenCL context: ", context);
+
+  {
+    std::lock_guard lock(mux);
+    if (auto it = cache.find(device); it != cache.end()) {
+      if (clReleaseContext(context) != CL_SUCCESS) {
+        gcLogE("Failed to release OpenCL context: ", context);
+      } else {
+        gcLogD("Released OpenCL context: ", context);
+      }
+      context = it->second;
+    } else {
+      cache.emplace(device, context);
+    }
+  }
+
+  return get(device, context);
+}
+
+llvm::Expected<OclRuntime> OclRuntime::get(cl_command_queue queue) {
+  cl_device_id device;
+  cl_context context;
+  CL_CHECK(clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id),
+                                 &device, nullptr),
+           "Failed to get CL_QUEUE_DEVICE.");
+  CL_CHECK(clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context),
+                                 &context, nullptr),
+           "Failed to get CL_QUEUE_CONTEXT.");
+  assert(device);
+  assert(context);
+  return get(device, context);
+}
+
+llvm::Expected<OclRuntime> OclRuntime::get(cl_device_id device,
+                                           cl_context context) {
+  auto ext = Ext::get(device, context);
+  CHECKE(ext, "Failed to create OclRuntime::Ext.");
+  return OclRuntime{*ext.get()};
+}
+
+bool OclRuntime::isOutOfOrder(cl_command_queue queue) {
+  cl_command_queue_properties properties;
+  cl_int err = clGetCommandQueueInfo(queue, CL_QUEUE_PROPERTIES,
+                                     sizeof(cl_command_queue_properties),
+                                     &properties, nullptr);
+  if (err != CL_SUCCESS) {
+    gcLogE("clGetCommandQueueInfo() failed with error code ", err);
+    // Enforcing out-of-order execution mode
+    return true;
+  }
+  return properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
+}
+
+cl_context OclRuntime::getContext() const { return ext.context; }
+
+cl_device_id OclRuntime::getDevice() const { return ext.device; }
+
+llvm::Expected<cl_command_queue>
+OclRuntime::createQueue(bool outOfOrder) const {
+  cl_int err;
+  cl_command_queue queue;
+#ifdef CL_VERSION_2_0
+  cl_queue_properties properties[] = {
+      CL_QUEUE_PROPERTIES,
+      static_cast<cl_queue_properties>(
+          outOfOrder ? CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE : 0),
+      0};
+  queue = clCreateCommandQueueWithProperties(ext.context, ext.device,
+                                             properties, &err);
+#else
+  const cl_command_queue_properties properties =
+      outOfOrder ? CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE : 0;
+  queue = clCreateCommandQueue(ext.context, ext.device, properties, &err);
+#endif
+  CL_CHECK(err, "Failed to create ", outOfOrder ? "out-of-order " : "",
+           "OpenCL command queue.");
+  gcLogD("Created new ", outOfOrder ? "out-of-order " : "",
+         "OpenCL command queue: ", queue);
+  return queue;
+}
+
+llvm::Expected<bool> OclRuntime::releaseQueue(cl_command_queue queue) {
+  CL_CHECK(clReleaseCommandQueue(queue),
+           "Failed to release OpenCL command queue: ", queue);
+  gcLogD("Released OpenCL command queue: ", queue);
+  return true;
+}
+
+llvm::Expected<void *> OclRuntime::usmAllocDev(size_t size) const {
+  cl_int err;
+  void *ptr = ext.clDeviceMemAllocINTEL(ext.context, ext.device, nullptr,
+                                        size, 0, &err);
+  CL_CHECK(err, "Failed to allocate ", size, " bytes of device USM memory.");
+  gcLogD("Allocated ", size, " bytes of device USM memory: ", ptr);
+  return ptr;
+}
+
+llvm::Expected<void *> OclRuntime::usmAllocShared(size_t size) const {
+  cl_int err;
+  void *ptr = ext.clSharedMemAllocINTEL(ext.context, ext.device, nullptr,
+                                        size, 0, &err);
+  CL_CHECK(err, "Failed to allocate ", size, " bytes of shared USM memory.");
+  gcLogD("Allocated ", size, " bytes of shared USM memory: ", ptr);
+  return ptr;
+}
+
+llvm::Expected<bool> OclRuntime::usmFree(const void *ptr) const {
+  CL_CHECK(ext.clMemFreeINTEL(ext.context, const_cast<void *>(ptr)),
+           "Failed to free USM memory: ", ptr);
+  gcLogD("Deallocated USM memory: ", ptr);
+  return true;
+}
+
+llvm::Expected<bool> OclRuntime::usmCpy(OclContext &ctx, const void *src,
+                                        void *dst, size_t size) const {
+  cl_int err;
+  if (ctx.preserveOrder) {
+    cl_event event;
+    err = ext.clEnqueueMemcpyINTEL(ctx.queue, false, dst, src, size,
+                                   ctx.waitListLen, ctx.waitList, &event);
+    ctx.setLastEvent(event);
+  } else {
+    err = ext.clEnqueueMemcpyINTEL(ctx.queue, false, dst, src, size, 0,
+                                   nullptr, nullptr);
+  }
+  CL_CHECK(err, "Failed to copy ", size, " bytes from ", src, " to ", dst);
+  gcLogD("Enqueued USM memory copy of ", size, " bytes from ", src, " to ",
+         dst);
+  return true;
+}
+
+bool OclRuntime::isUsm(const void *ptr) const {
+  cl_mem_info_intel allocType;
+  auto err = ext.clGetMemAllocInfoINTEL(
+      ext.context, ptr, CL_MEM_ALLOC_TYPE_INTEL, sizeof(cl_mem_info_intel),
+      &allocType, nullptr);
+  return err == CL_SUCCESS && allocType != CL_MEM_TYPE_UNKNOWN_INTEL;
+}
+
+#ifndef NDEBUG
+void OclRuntime::debug(const char *file, int line, const char *msg) {
+#ifndef GC_LOG_NO_DEBUG
+  mlir::gc::log::insetLog(file, line, std::cout, "DEBUG", msg);
+#endif
+}
+#endif
+
+llvm::Expected<bool> OclContext::finish() {
+  gcLogD("Waiting for the enqueued OpenCL commands to finish: ", queue);
+  CL_CHECK(clFinish(queue),
+           "Failed to finish the OpenCL command queue: ", queue);
+  if (preserveOrder) {
+    waitListLen = 0;
+    waitList = nullptr;
+    lastEvent = nullptr;
+  }
+  return true;
+}
+
+OclModule::~OclModule() {
+  assert(engine);
+  auto fn = engine->lookup(GPU_OCL_MOD_DESTRUCTOR);
+  if (fn) {
+    reinterpret_cast<void (*)()>(fn.get())();
+  } else {
+    gcLogE("Module function ", GPU_OCL_MOD_DESTRUCTOR, " not found!");
+    llvm::consumeError(fn.takeError());
+  }
+}
+
+// If all arguments of 'origFunc' are memrefs with static shape, create a new
+// function called gcGpuOclStaticMain, that accepts 2 arguments: a pointer to
+// OclContext and a pointer to an array, containing pointers to aligned memory
+// buffers. The function will call the original function with the context,
+// buffers and the offset/shape/strides, statically created from the
+// memref descriptor.
+StringRef createStaticMain(ModuleOp &module, const StringRef &funcName,
+                           const ArrayRef<Type> argTypes) {
+  auto mainFunc = module.lookupSymbol<LLVM::LLVMFuncOp>(funcName);
+  if (!mainFunc) {
+    gcReportErr("The function '", funcName.begin(), "' not found.");
+  }
+
+  // Check that the last 3 args are added by AddContextArg
+  auto mainArgTypes = mainFunc.getArgumentTypes();
+  auto nargs = mainArgTypes.size();
+  if (nargs < 3) {
+    gcReportErr("The function '", funcName.begin(),
+                "' must have at least 3 arguments.");
+  }
+
+  auto ctx = module.getContext();
+  ctx->getOrLoadDialect<LLVM::LLVMDialect>();
+  OpBuilder builder(ctx);
+  auto i64Type = builder.getI64Type();
+  auto ptrType = LLVM::LLVMPointerType::get(ctx);
+
+  if (mainArgTypes[nargs - 3] != ptrType ||
+      mainArgTypes[nargs - 2] != ptrType ||
+      mainArgTypes[nargs - 1] != i64Type) {
+    gcReportErr("The last 3 arguments of the function '", funcName.begin(),
+                "' must be of type (!llvm.ptr, !llvm.ptr, i64).");
+  }
+
+  // argTypes contains only the original arguments, before lowering
+  nargs = argTypes.size();
+  if (nargs == 0) {
+    // This is a no-arg function with the context param added by AddContextArg
+    return funcName;
+  }
+
+  mainFunc.setAlwaysInline(true);
+  SmallVector<int64_t> constArgs;
+  unsigned argsCounter = 0;
+
+  for (unsigned i = 0; i < nargs; ++i) {
+    if (auto type = mlir::dyn_cast<MemRefType>(argTypes[i])) {
+      if (!type.hasStaticShape()) {
+        gcLogD("The argument ", i, " of the function ", funcName.begin(),
+               " has a dynamic shape.");
+        return {};
+      }
+
+      auto shape = type.getShape();
+      SmallVector<int64_t> strides;
+      int64_t offset;
+      if (failed(getStridesAndOffset(type, strides, offset))) {
+        gcLogD("Failed to get strides and offset of arg", i,
+               " of the function ", funcName.begin());
+        return {};
+      }
+      constArgs.emplace_back(offset);
+      constArgs.append(shape.begin(), shape.end());
+      constArgs.append(strides.begin(), strides.end());
+      argsCounter += shape.size() * 2 + 3;
+    } else {
+      gcLogD("The argument ", i, " of the function ", funcName.begin(),
+             " is not of type MemRefType.");
+      return {};
+    }
+  }
+
+  auto loc = mainFunc.getLoc();
+  auto newFuncType = LLVM::LLVMFunctionType::get(
+      mainFunc.getNumResults() ? mainFunc->getResult(0).getType()
+                               : LLVM::LLVMVoidType::get(ctx),
+      {ptrType, ptrType});
+  auto newFunc =
+      OpBuilder::atBlockEnd(module.getBody())
+          .create<LLVM::LLVMFuncOp>(loc, "gcGpuOclStaticMain", newFuncType);
+  auto &entryBlock = *newFunc.addEntryBlock(builder);
+  builder.setInsertionPointToStart(&entryBlock);
+  Value arrayPtr = entryBlock.getArgument(1);
+
+  std::unordered_map<int64_t, Value> constMap;
+  auto createConst = [&](int64_t i) {
+    if (auto v = constMap.find(i); v != constMap.end()) {
+      return v->second;
+    }
+    return constMap
+        .emplace(i, builder.create<LLVM::ConstantOp>(
+                        loc, i64Type, builder.getIntegerAttr(i64Type, i)))
+        .first->second;
+  };
+  Value zero = createConst(0);
+  Value one = nargs ? createConst(1) : Value{};
+  SmallVector<Value> args;
+  args.reserve(argsCounter);
+
+  for (unsigned i = 0, j = 0; i < nargs; i++) {
+    if (i != 0) {
+      arrayPtr =
+          builder.create<LLVM::GEPOp>(loc, ptrType, ptrType, arrayPtr, one);
+    }
+
+    auto ptr = builder.create<LLVM::LoadOp>(loc, ptrType, arrayPtr);
+    args.emplace_back(ptr);
+    args.emplace_back(ptr);
+    args.emplace_back(createConst(constArgs[j++]));
+
+    for (unsigned k = 0,
+                  m = 2 * mlir::cast<MemRefType>(argTypes[i]).getShape().size();
+         k < m; k++) {
+      args.emplace_back(createConst(constArgs[j++]));
+    }
+  }
+
+  auto oclCtxArg = entryBlock.getArgument(0);
+  args.emplace_back(oclCtxArg);
+  args.emplace_back(oclCtxArg);
+  args.emplace_back(zero);
+
+  auto call = builder.create<LLVM::CallOp>(loc, mainFunc, args);
+  builder.create<LLVM::ReturnOp>(loc, call.getResults());
+  return newFunc.getName();
+}
+
+StringRef getFuncName(const OclModuleBuilderOpts &opts, ModuleOp &mod) {
+  if (!opts.funcName.empty()) {
+    return opts.funcName;
+  }
+  for (auto &op : mod.getBody()->getOperations()) {
+    if (auto fn = dyn_cast<LLVM::LLVMFuncOp>(op);
+        fn && !fn.isExternal() && fn.isPublic()) {
+      return fn.getName();
+    }
+    if (auto fn = dyn_cast<func::FuncOp>(op);
+        fn && !fn.isExternal() && fn.isPublic()) {
+      return fn.getName();
+    }
+  }
+  gcReportErr("Failed to find a public function in the module.");
+}
+
+ArrayRef<Type> getArgTypes(const StringRef &funcName, ModuleOp &mod) {
+  if (auto fn = mod.lookupSymbol<LLVM::LLVMFuncOp>(funcName)) {
+    return fn.getArgumentTypes();
+  }
+  if (auto fn = mod.lookupSymbol<func::FuncOp>(funcName)) {
+    return fn.getArgumentTypes();
+  }
+  gcReportErr("Failed to find the function '", funcName.begin(),
+              "' in the module.");
+}
+
+OclModuleBuilder::OclModuleBuilder(ModuleOp module,
+                                   const OclModuleBuilderOpts &opts)
+    : mlirModule(module), enableObjectDump(opts.enableObjectDump),
+      sharedLibPaths(opts.sharedLibPaths),
+      pipeline(opts.pipeline
+                   ? opts.pipeline
+                   : [](OpPassManager &pm) { populateGPUPipeline(pm, {}); }),
+      funcName(getFuncName(opts, mlirModule)),
+      argTypes(getArgTypes(funcName, mlirModule)) {}
+
+llvm::Expected<std::shared_ptr<const OclModule>>
+OclModuleBuilder::build(const OclRuntime &runtime) {
+  {
+    std::shared_lock lock(mux);
+    if (auto it = cache.find(runtime.ext); it != cache.end()) {
+      return it->second;
+    }
+  }
+  return build(runtime.ext);
+}
+
+llvm::Expected<std::shared_ptr<const OclModule>>
+OclModuleBuilder::build(cl_command_queue queue) {
+  auto rt = OclRuntime::get(queue);
+  CHECKE(rt, "Failed to create OclRuntime.");
+  return build(rt.get());
+}
+
+llvm::Expected<std::shared_ptr<const OclModule>>
+OclModuleBuilder::build(cl_device_id device, cl_context context) {
+  {
+    OclDevCtxPair pair{device, context};
+    std::shared_lock lock(mux);
+    if (auto it = cache.find(pair); it != cache.end()) {
+      return it->second;
+    }
+  }
+
+  auto ext = OclRuntime::Ext::get(device, context);
+  CHECKE(ext, "Failed to create OclRuntime::Ext.");
+  return build(*ext.get());
+}
+
+llvm::Expected<std::shared_ptr<const OclModule>>
+OclModuleBuilder::build(const OclRuntime::Ext &ext) {
+  auto mod = mlirModule.clone();
+  PassManager pm{mod.getContext()};
+  pipeline(pm);
+  CHECK(!pm.run(mod).failed(), "GPU pipeline failed!");
+
+  auto staticMain = createStaticMain(mod, funcName, argTypes);
+
+  ExecutionEngineOptions opts;
+  opts.jitCodeGenOptLevel = llvm::CodeGenOptLevel::Aggressive;
+  opts.enableObjectDump = enableObjectDump;
+#ifdef NDEBUG
+  opts.enableGDBNotificationListener = false;
+  opts.enablePerfNotificationListener = false;
+#endif
+
+  auto eng = ExecutionEngine::create(mod, opts);
+  CHECKE(eng, "Failed to create ExecutionEngine!");
+  eng->get()->registerSymbols(OclRuntime::Exports::symbolMap);
+
+  OclModule::MainFunc main = {nullptr};
+
+  if (staticMain.empty()) {
+    auto expect = eng.get()->lookupPacked(funcName);
+    CHECKE(expect, "Packed function '", funcName.begin(), "' not found!");
+    main.wrappedMain = *expect;
+  } else {
+    auto expect = eng.get()->lookup(staticMain);
+    CHECKE(expect, "Compiled function '", staticMain.begin(), "' not found!");
+    main.staticMain = reinterpret_cast<OclModule::StaticMainFunc>(*expect);
+  }
+
+  std::lock_guard lock(mux);
+  if (auto it = cache.find(ext); it != cache.end()) {
+    return it->second;
+  }
+  std::shared_ptr<const OclModule> ptr(
+      new OclModule(OclRuntime(ext), !staticMain.empty(), main, argTypes,
+                    std::move(eng.get())));
+  return cache.emplace(OclDevCtxPair(ext.device, ext.context), ptr)
+      .first->second;
+}
+} // namespace mlir::gc::gpu
\ No newline at end of file

diff --git a/lib/gc/Transforms/GPU/CMakeLists.txt b/lib/gc/Transforms/GPU/CMakeLists.txt
index 3909681e3..47d3899bb 100644
--- a/lib/gc/Transforms/GPU/CMakeLists.txt
+++ b/lib/gc/Transforms/GPU/CMakeLists.txt
@@ -1,3 +1,15 @@
+include(imex)
+get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES)
+get_property(IMEX_DIALECT_LIBS GLOBAL PROPERTY IMEX_DIALECT_LIBS)
+get_property(IMEX_CONVERSION_LIBS GLOBAL PROPERTY IMEX_CONVERSION_LIBS)
+set(IMEX_LIBS
+  ${IMEX_CONVERSION_LIBS}
+  IMEXUtil
+  IMEXTransforms
+  ${IMEX_DIALECT_LIBS}
+)
+set_property(GLOBAL APPEND PROPERTY IMEX_LIBS ${IMEX_LIBS})
+
 gc_add_mlir_library(GcGpuPasses
   AddContextArg.cpp
   GpuToGpuOcl.cpp
@@ -19,9 +31,10 @@ gc_add_mlir_library(GcGpuPasses
   MLIRMemRefTransforms
   GcInterface
   GcUtilsIR
+  ${IMEX_LIBS}
 )
 
-include(imex)
-get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES)
-target_include_directories(GcGpuPasses PRIVATE ${IMEX_INCLUDES})
-
+foreach (include_dir IN LISTS IMEX_INCLUDES)
+  list(APPEND IMEX_BUILD_INCLUDES $<BUILD_INTERFACE:${include_dir}>)
+endforeach ()
+target_include_directories(GcGpuPasses PUBLIC ${IMEX_BUILD_INCLUDES})

diff --git a/lib/gc/Transforms/GPU/GpuToGpuOcl.cpp b/lib/gc/Transforms/GPU/GpuToGpuOcl.cpp
index f765d29cf..3bad9fc0f 100644
--- a/lib/gc/Transforms/GPU/GpuToGpuOcl.cpp
+++ b/lib/gc/Transforms/GPU/GpuToGpuOcl.cpp
@@ -124,11 +124,13 @@ struct ConvertAlloc final : ConvertOpPattern<gpu::AllocOp> {
   LogicalResult
   matchAndRewrite(gpu::AllocOp allocOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    auto fnName =
+        allocOp.getHostShared() ? GPU_OCL_MALLOC_SHARED : GPU_OCL_MALLOC_DEV;
     auto loc = allocOp.getLoc();
     MemRefType type = allocOp.getType();
 
     if (auto staticSize = helper.calculateStaticSize(rewriter, loc, type)) {
-      auto ptr = funcCall(rewriter, GPU_OCL_MALLOC, helper.ptrType,
+      auto ptr = funcCall(rewriter, fnName, helper.ptrType,
                           {helper.ptrType, helper.idxType}, loc,
                           {getCtxPtr(rewriter), staticSize})
                      .getResult();
@@ -151,7 +153,7 @@ struct ConvertAlloc final : ConvertOpPattern<gpu::AllocOp> {
                                shape, strides, size);
 
     assert(shape.size() == strides.size());
-    auto ptr = funcCall(rewriter, GPU_OCL_MALLOC, helper.ptrType,
+    auto ptr = funcCall(rewriter, fnName, helper.ptrType,
                         {helper.ptrType, helper.idxType}, loc,
                         {getCtxPtr(rewriter), size})
                    .getResult();
@@ -224,7 +226,8 @@ struct ConvertMemcpy final : ConvertOpPattern<gpu::MemcpyOp> {
 
 struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
 
-  explicit ConvertLaunch(const Helper &helper) : ConvertOpPattern(helper) {}
+  explicit ConvertLaunch(const Helper &helper, bool callFinish)
+      : ConvertOpPattern(helper), callFinish(callFinish) {}
 
   LogicalResult
   matchAndRewrite(gpu::LaunchFuncOp gpuLaunch, OpAdaptor adaptor,
@@ -261,10 +264,17 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
         funcCall(rewriter, GPU_OCL_KERNEL_LAUNCH, helper.voidType,
                  {helper.ptrType, helper.ptrType}, loc, args, true);
     rewriter.replaceOp(gpuLaunch, gpuOclLaunch);
+
+    if (callFinish) {
+      funcCall(rewriter, GPU_OCL_FINISH, helper.voidType, {helper.ptrType},
+               loc, getCtxPtr(rewriter));
+    }
+
     return success();
   }
 
 private:
+  bool callFinish;
   // Returns the kernel pointer stored in the global var ...name_Ptr.
   // If it's NULL, calls the createKernel() function.
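+  // The emitted check is roughly equivalent to this pseudo-code (an
+  // illustrative sketch only; the actual LLVM dialect IR is constructed
+  // programmatically below):
+  //
+  //   kernel = load @<kernel_name>_Ptr
+  //   if (kernel == null)
+  //     kernel = gcGpuOclKernelCreate(ctx, spirv, name, ...)
+  //   return kernel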
std::optional getKernel(gpu::LaunchFuncOp &gpuLaunch, @@ -377,8 +387,9 @@ struct ConvertLaunch final : ConvertOpPattern { auto ptr = mod.lookupSymbol(str("Ptr")); assert(ptr); - SmallVector nameChars(kernelModName.getValue().begin(), - kernelModName.getValue().end()); + auto kernelName = gpuLaunch.getKernelName(); + SmallVector nameChars(kernelName.getValue().begin(), + kernelName.getValue().end()); nameChars.emplace_back('\0'); // Kernel name and SPIRV are stored as global strings auto name = LLVM::createGlobalString( @@ -495,6 +506,10 @@ struct ConvertLaunch final : ConvertOpPattern { struct GpuToGpuOcl final : gc::impl::GpuToGpuOclBase { + explicit GpuToGpuOcl() : GpuToGpuOcl(gc::GpuToGpuOclOptions{}) {} + explicit GpuToGpuOcl(const gc::GpuToGpuOclOptions &opts) + : GpuToGpuOclBase(opts) {} + void runOnOperation() override { const auto ctx = &getContext(); const LLVMConversionTarget target(getContext()); @@ -503,8 +518,9 @@ struct GpuToGpuOcl final : gc::impl::GpuToGpuOclBase { RewritePatternSet patterns(ctx); populateGpuToLLVMConversionPatterns(converter, patterns); - patterns.insert( - helper); + patterns.insert(helper); + patterns.insert(helper, callFinish); + patterns.insert(helper); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) { diff --git a/lib/gc/Transforms/GPU/Pipeline.cpp b/lib/gc/Transforms/GPU/Pipeline.cpp index d7bc69e13..c1c3735aa 100644 --- a/lib/gc/Transforms/GPU/Pipeline.cpp +++ b/lib/gc/Transforms/GPU/Pipeline.cpp @@ -6,49 +6,33 @@ // //===----------------------------------------------------------------------===// +#include + +#include "gc/Transforms/Passes.h" + +#include "imex/Conversion/Passes.h" +#include "imex/Transforms/Passes.h" + #include "mlir/Conversion/Passes.h" -#include "mlir/Dialect/Arith/Transforms/Passes.h" #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h" #include "mlir/Dialect/Bufferization/Transforms/Passes.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/LLVMIR/Transforms/Passes.h" #include "mlir/Dialect/Linalg/Passes.h" -#include "mlir/Dialect/Math/Transforms/Passes.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/MemRef/Transforms/Passes.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/IR/DialectRegistry.h" +#include "mlir/Dialect/SPIRV/Transforms/Passes.h" #include "mlir/InitAllPasses.h" #include "mlir/Pass/PassManager.h" -#include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/Passes.h" -#include - -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/GPU/Transforms/Passes.h" -#include "mlir/Dialect/SPIRV/Transforms/Passes.h" - -#include -#include - -#include - -#include "gc/Transforms/Passes.h" namespace mlir::gc { -struct GPUPipelineOption : PassPipelineOptions { - PassOptions::Option isUsmArgs{ - *this, "is-usm-args", - llvm::cl::desc("Whether to use USM(unified shared memory) func args, in " - "which the host and device could access the same buffer " - "and there is no need to add memcpy explicitly"), - llvm::cl::init(true)}; -}; - -void populateGPUPipeline(mlir::OpPassManager &pm, +void populateGPUPipeline(OpPassManager &pm, const GPUPipelineOption &pipelineOption) { + // Add an argument for the GPU context + pm.addNestedPass(createAddContextArg()); + pm.addNestedPass(createIterativeTilingAndFusion()); pm.addPass(bufferization::createEmptyTensorEliminationPass()); @@ -91,6 +75,7 
@@ void populateGPUPipeline(mlir::OpPassManager &pm, /*isUsmArgs*/ pipelineOption.isUsmArgs.getValue()}; pm.addNestedPass( imex::createInsertGPUAllocsPass(insertGPUAllocsOption)); + pm.addPass(createGpuKernelOutliningPass()); pm.addPass(createCanonicalizerPass()); pm.addPass(imex::createSetSPIRVCapabilitiesPass()); @@ -109,7 +94,6 @@ void populateGPUPipeline(mlir::OpPassManager &pm, pm.addNestedPass(LLVM::createRequestCWrappersPass()); pm.addPass(imex::createSerializeSPIRVPass()); pm.addPass(createConvertVectorToSCFPass()); - pm.addPass(imex::createConvertGPUToGPUXPass()); pm.addPass(createConvertSCFToCFPass()); pm.addPass(createConvertControlFlowToLLVMPass()); pm.addPass(createConvertVectorToLLVMPass()); @@ -117,7 +101,7 @@ void populateGPUPipeline(mlir::OpPassManager &pm, pm.addPass(createArithToLLVMConversionPass()); pm.addPass(createConvertFuncToLLVMPass()); pm.addPass(createConvertMathToLLVMPass()); - pm.addPass(imex::createConvertGPUXToLLVMPass()); + pm.addPass(createGpuToGpuOcl({pipelineOption.callFinish})); pm.addPass(createConvertIndexToLLVMPass()); pm.addPass(memref::createExpandStridedMetadataPass()); pm.addPass(createLowerAffinePass()); diff --git a/src/gc-opt/CMakeLists.txt b/src/gc-opt/CMakeLists.txt index 44de22b32..d6a9b6560 100644 --- a/src/gc-opt/CMakeLists.txt +++ b/src/gc-opt/CMakeLists.txt @@ -49,7 +49,6 @@ target_link_libraries(gc-opt PRIVATE if(GC_ENABLE_IMEX) include(imex) - target_compile_options(gc-opt PRIVATE -DGC_USE_IMEX) get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES) target_include_directories(gc-opt PRIVATE ${IMEX_INCLUDES}) target_link_libraries(gc-opt PRIVATE diff --git a/test/mlir/test/gc/Transforms/GPU/gpu-to-gpuocl.mlir b/test/mlir/test/gc/Transforms/GPU/gpu-to-gpuocl.mlir index 8a5571571..0c3998b5d 100644 --- a/test/mlir/test/gc/Transforms/GPU/gpu-to-gpuocl.mlir +++ b/test/mlir/test/gc/Transforms/GPU/gpu-to-gpuocl.mlir @@ -68,7 +68,7 @@ module @test attributes {gpu.container_module} { // CHECK: llvm.func @entry(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, [[CTX:%.+]]: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64) // CHECK: [[SIZE:%.+]] = llvm.mlir.constant(16384 : i64) : i64 -// CHECK: llvm.call @gcGpuOclMalloc([[CTX]], [[SIZE]]) +// CHECK: llvm.call @gcGpuOclMallocShared([[CTX]], [[SIZE]]) // CHECK: [[SIZE:%.+]] = llvm.mlir.constant(16384 : i64) : i64 // CHECK: [[SRC:%.+]] = llvm.extractvalue // CHECK: [[DST:%.+]] = llvm.extractvalue [[GPU_MEMREF:%.+]][1] diff --git a/test/mlir/test/gc/gpu-runner/lit.local.cfg b/test/mlir/test/gc/gpu-runner/lit.local.cfg index f180dd41b..6dce3e36e 100644 --- a/test/mlir/test/gc/gpu-runner/lit.local.cfg +++ b/test/mlir/test/gc/gpu-runner/lit.local.cfg @@ -1,2 +1,4 @@ if not config.gc_use_imex: - config.unsupported = True \ No newline at end of file + config.unsupported = True +else: # FIXME: Remove this when the GPU runner is implemented. 
+    config.unsupported = True
diff --git a/test/mlir/unittests/ExecutionEngine/CMakeLists.txt b/test/mlir/unittests/ExecutionEngine/CMakeLists.txt
index 9f5747b58..9c28d75d4 100644
--- a/test/mlir/unittests/ExecutionEngine/CMakeLists.txt
+++ b/test/mlir/unittests/ExecutionEngine/CMakeLists.txt
@@ -9,5 +9,9 @@ target_include_directories(GCExecutionEngineTests PRIVATE ${GC_DNNL_INCLUDES})
 target_link_libraries(GCExecutionEngineTests
         PRIVATE
         GcJitWrapper
-        GcCpuRuntime)
+        GcCpuRuntime
+)
+
+if(GC_ENABLE_IMEX)
+  add_subdirectory(GPU)
+endif()
diff --git a/test/mlir/unittests/ExecutionEngine/GPU/CMakeLists.txt b/test/mlir/unittests/ExecutionEngine/GPU/CMakeLists.txt
new file mode 100644
index 000000000..1aefee321
--- /dev/null
+++ b/test/mlir/unittests/ExecutionEngine/GPU/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_mlir_unittest(GCExecutionEngineGpuTests
+  GpuOclRuntimeTest.cpp
+)
+target_link_libraries(GCExecutionEngineGpuTests
+  PRIVATE
+  GcJitWrapper
+  GcGpuOclRuntime
+)
diff --git a/test/mlir/unittests/ExecutionEngine/GPU/GpuOclRuntimeTest.cpp b/test/mlir/unittests/ExecutionEngine/GPU/GpuOclRuntimeTest.cpp
new file mode 100644
index 000000000..9d7b18bca
--- /dev/null
+++ b/test/mlir/unittests/ExecutionEngine/GPU/GpuOclRuntimeTest.cpp
@@ -0,0 +1,232 @@
+//===-- GpuOclRuntimeTest.cpp - GPU OpenCL runtime tests --------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+#include
+
+#include "gc/ExecutionEngine/Driver/Driver.h"
+#include "gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h"
+#include "gc/Utils/Error.h"
+
+#include "mlir/ExecutionEngine/MemRefUtils.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/InitAllDialects.h"
+#include "mlir/InitAllPasses.h"
+#include "mlir/Parser/Parser.h"
+#include "mlir/Target/LLVMIR/Export.h"
+#include "mlir/Target/LLVMIR/ModuleTranslation.h"
+
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+
+#include "gtest/gtest.h"
+
+using namespace mlir;
+using namespace gc::gpu;
+
+constexpr char addStatic[] = R"mlir(
+module @test {
+  func.func @entry(%arg0: memref<64x64xf32>, %arg1: memref<64x64xf32>, %arg2: memref<64x64xf32>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<64x64xf32>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<64x64xf32>
+    %2 = tensor.empty() : tensor<64x64xf32>
+    %3 = linalg.add ins(%1, %0 : tensor<64x64xf32>, tensor<64x64xf32>) outs(%2 : tensor<64x64xf32>) -> tensor<64x64xf32>
+    bufferization.materialize_in_destination %3 in restrict writable %arg2 : (tensor<64x64xf32>, memref<64x64xf32>) -> ()
+    return
+  }
+}
+)mlir";
+
+constexpr char addDynamic[] = R"mlir(
+module @test {
+  func.func @entry(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = bufferization.to_tensor %arg0 restrict : memref<?x?xf32>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<?x?xf32>
+    %d0 = memref.dim %arg0, %c0 : memref<?x?xf32>
+    %d1 = memref.dim %arg0, %c1 : memref<?x?xf32>
+    %2 = tensor.empty(%d0, %d1) : tensor<?x?xf32>
+    %3 = linalg.add ins(%1, %0 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%2 : tensor<?x?xf32>) -> tensor<?x?xf32>
+    bufferization.materialize_in_destination %3 in restrict writable %arg2 : (tensor<?x?xf32>, memref<?x?xf32>) -> ()
+    return
+  }
+}
+)mlir";
+
+constexpr char matmulAddStatic[] = R"mlir(
+module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>>>} {
+  func.func @entry(%arg0: memref<64x128xf32>, %arg1: memref<128x128xf32>, %arg2: memref<64x128xf32>) {
+    %0 = bufferization.to_tensor %arg0 restrict : memref<64x128xf32>
+    %1 = bufferization.to_tensor %arg1 restrict : memref<128x128xf32>
+    %2 = tensor.empty() : tensor<64x128xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<64x128xf32>) -> tensor<64x128xf32>
+    %4 = linalg.matmul_transpose_b ins(%0, %1 : tensor<64x128xf32>, tensor<128x128xf32>) outs(%3 : tensor<64x128xf32>) -> tensor<64x128xf32>
+    %5 = tensor.empty() : tensor<64x128xf32>
+    %6 = linalg.add ins(%4, %0 : tensor<64x128xf32>, tensor<64x128xf32>) outs(%5 : tensor<64x128xf32>) -> tensor<64x128xf32>
+    bufferization.materialize_in_destination %6 in restrict writable %arg2 : (tensor<64x128xf32>, memref<64x128xf32>) -> ()
+    return
+  }
+}
+)mlir";
+
+struct TestBase {
+  OclRuntime runtime = gcGetOrReport(OclRuntime::get());
+  cl_command_queue queue = gcGetOrReport(runtime.createQueue());
+  OclContext ctx{runtime, queue};
+  MLIRContext mlirCtx{gc::initCompilerAndGetDialects()};
+
+  virtual void exec(std::shared_ptr<const OclModule> &mod) = 0;
+
+  virtual ~TestBase() { gcGetOrReport(runtime.releaseQueue(queue)); }
+
+  OwningOpRef<ModuleOp> parse(const char *code) {
+    std::unique_ptr<llvm::MemoryBuffer> memBuf =
+        llvm::MemoryBuffer::getMemBuffer(code);
+    llvm::SourceMgr srcMgr;
+    srcMgr.AddNewSourceBuffer(std::move(memBuf), SMLoc());
+    return parseSourceFile<ModuleOp>(srcMgr, &mlirCtx);
+  }
+};
+
+template <unsigned N, unsigned M = N> struct TestAdd : TestBase {
+  static constexpr unsigned size = N * M;
+  float *buf0 = gcGetOrReport(runtime.usmNewDev<float>(size));
+  float *buf1 = gcGetOrReport(runtime.usmNewDev<float>(size));
+  float *buf2 = gcGetOrReport(runtime.usmNewShared<float>(size));
+
+  explicit TestAdd() {
+    float cpuBuf[size];
+    std::fill(cpuBuf, cpuBuf + size, 2.0f);
+    assert(runtime.usmCpy(ctx, cpuBuf, buf0, size));
+    assert(runtime.usmCpy(ctx, cpuBuf, buf1, size));
+    gcGetOrReport(ctx.finish());
+  }
+
+  ~TestAdd() override {
+    assert(runtime.usmFree(buf0));
+    assert(runtime.usmFree(buf1));
+    assert(runtime.usmFree(buf2));
+  }
+
+  void test(const char *code) {
+    OclModuleBuilder builder(parse(code));
+    auto mod = gcGetOrReport(builder.build(runtime));
+    exec(mod);
+
+    float cpuBuf[size];
+    assert(runtime.usmCpy(ctx, buf2, cpuBuf, size));
+    gcGetOrReport(ctx.finish());
+
+    for (unsigned i = 0; i < size; i++) {
+      // std::cout << buf2[i] << " ";
+      assert(buf2[i] == 4.0f);
+    }
+    // std::cout << "\n";
+
+    for (float i : cpuBuf) {
+      // std::cout << i << " ";
+      assert(i == 4.0f);
+    }
+  }
+};
+
+template <unsigned N, unsigned M> struct TestMatmulAdd : TestBase {
+  static constexpr unsigned size1 = N * M;
+  static constexpr unsigned size2 = M * M;
+  float *buf0 = gcGetOrReport(runtime.usmNewDev<float>(size1));
+  float *buf1 = gcGetOrReport(runtime.usmNewDev<float>(size2));
+  float *buf2 = gcGetOrReport(runtime.usmNewShared<float>(size1));
+
+  explicit TestMatmulAdd() {
+    float cpuBuf[size2];
+    std::fill(cpuBuf, cpuBuf + size2, 2.0f);
+    assert(runtime.usmCpy(ctx, cpuBuf, buf0, size1));
+    assert(runtime.usmCpy(ctx, cpuBuf, buf1, size2));
+    gcGetOrReport(ctx.finish());
+  }
+
+  ~TestMatmulAdd() override {
+    assert(runtime.usmFree(buf0));
+    assert(runtime.usmFree(buf1));
+    assert(runtime.usmFree(buf2));
+  }
+
+  void test(const char *code) {
+    OclModuleBuilder builder(parse(code));
+    auto mod = gcGetOrReport(builder.build(runtime));
+    exec(mod);
+
+    gcGetOrReport(ctx.finish());
+    for (unsigned i = 0; i < size1; i++) {
+      // std::cout << buf2[i] << " ";
+      // Each output element is 2 * 2 summed over K=128 (matmul), plus 2 (add): 512 + 2 = 514.
+      assert(buf2[i] == 514);
+    }
+    // std::cout << "\n";
+  }
+};
+
+TEST(GpuOclRuntime, TestAddStatic) {
+  struct TestAddStatic1 : TestAdd<64> {
+    void exec(std::shared_ptr<const OclModule> &mod) override {
+      assert(mod->isStatic);
+      StaticExecutor<3> exec(mod);
+      exec(ctx, buf0, buf1, buf2);
+      // Check if the executor is allocated on the stack
+      assert(exec.isSmall());
+    }
+  } test1;
+  test1.test(addStatic);
+
+  struct TestAddStatic2 : TestAdd<64> {
+    void exec(std::shared_ptr<const OclModule> &mod) override {
+      assert(mod->isStatic);
+      StaticExecutor<3> exec(mod);
+      exec.arg(buf0);
+      exec.arg(buf1);
+      exec.arg(buf2);
+      // Check if the executor is allocated on the stack
+      assert(exec.isSmall());
+      exec(ctx);
+    }
+  } test2;
+  test2.test(addStatic);
+}
+
+TEST(GpuOclRuntime, TestAddDynamic) {
+  GTEST_SKIP() << "Dynamic shapes are not yet supported";
+  struct TestAddDynamic : TestAdd<32, 64> {
+    void exec(std::shared_ptr<const OclModule> &mod) override {
+      assert(!mod->isStatic);
+      int64_t shape[] = {32, 64};
+      int64_t strides[] = {64, 1};
+      DynamicExecutor<24> exec(mod);
+      exec.arg(buf0, 2, shape, strides);
+      exec.arg(buf1, 2, shape, strides);
+      exec.arg(buf2, 2, shape, strides);
+      exec(ctx);
+      // Check if the executor is allocated on the stack
+      assert(exec.isSmall());
+    }
+  } test;
+  test.test(addDynamic);
+}
+
+TEST(GpuOclRuntime, TestMatmulAddStatic) {
+  struct Test : TestMatmulAdd<64, 128> {
+    void exec(std::shared_ptr<const OclModule> &mod) override {
+      assert(mod->isStatic);
+      StaticExecutor<3> exec(mod);
+      exec(ctx, buf0, buf1, buf2);
+      assert(exec.isSmall());
+    }
+  } test;
+  test.test(matmulAddStatic);
+}
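
Usage sketch (not part of the patch): the unit test above doubles as the best illustration of the new client-side flow, so the condensed example below restates it in one place: build an OclModule once with OclModuleBuilder, then launch it through a StaticExecutor. It is a minimal sketch under stated assumptions, not a verbatim copy of the runtime API: the element-typed usmNewDev<float>/usmNewShared<float> form and the std::shared_ptr<const OclModule> build result are inferred from the test reconstruction above, the main() driver and parseSourceString call are illustrative, and addStatic refers to the MLIR string constant defined in the test.

// Hypothetical standalone driver mirroring GpuOclRuntimeTest.cpp.
#include <utility>

#include "gc/ExecutionEngine/Driver/Driver.h"
#include "gc/ExecutionEngine/GPURuntime/GpuOclRuntime.h"
#include "gc/Utils/Error.h"

#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/Parser/Parser.h"

using namespace mlir;
using namespace mlir::gc::gpu;

int main() {
  // Runtime, queue and context: the same setup as TestBase above.
  OclRuntime runtime = gcGetOrReport(OclRuntime::get());
  cl_command_queue queue = gcGetOrReport(runtime.createQueue());
  OclContext ctx{runtime, queue};

  // Two device-only inputs and one host-visible (shared) output.
  constexpr unsigned size = 64 * 64;
  float *in0 = gcGetOrReport(runtime.usmNewDev<float>(size));
  float *in1 = gcGetOrReport(runtime.usmNewDev<float>(size));
  float *out = gcGetOrReport(runtime.usmNewShared<float>(size));

  // Compile once; addStatic is the 64x64 linalg.add module from the test.
  MLIRContext mlirCtx{gc::initCompilerAndGetDialects()};
  OwningOpRef<ModuleOp> module =
      parseSourceString<ModuleOp>(addStatic, &mlirCtx);
  OclModuleBuilder builder(std::move(module));
  auto mod = gcGetOrReport(builder.build(runtime));

  // Static shapes: bind the three memref arguments and launch.
  StaticExecutor<3> exec(mod);
  exec(ctx, in0, in1, out);
  gcGetOrReport(ctx.finish()); // block until the queue drains

  // Release the USM buffers and the queue (success flags ignored for brevity).
  runtime.usmFree(in0);
  runtime.usmFree(in1);
  runtime.usmFree(out);
  gcGetOrReport(runtime.releaseQueue(queue));
  return 0;
}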