Skip to content

Commit

Permalink
SPIR-V GEMM 4K test case.
Browse files Browse the repository at this point in the history
Update Level Zero and SYCL runtime to add large(double) number of GRFs option.
Add performance report.

Co-Authored-By: Gusthinna Waduge, Charitha Saumya <[email protected]>
  • Loading branch information
2 people authored and silee2 committed Nov 17, 2023
1 parent 5b8a5a2 commit 40461b6
Show file tree
Hide file tree
Showing 7 changed files with 901 additions and 3 deletions.
3 changes: 3 additions & 0 deletions include/imex/ExecutionEngine/ImexRunnerUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ _mlir_ciface_fillResource1DF16(MemRefDescriptor<f16, 1> *ptr, // NOLINT
extern "C" IMEX_RUNNERUTILS_EXPORT void
_mlir_ciface_fillResource1DF32(MemRefDescriptor<float, 1> *ptr, // NOLINT
float value);
extern "C" IMEX_RUNNERUTILS_EXPORT void
_mlir_ciface_fillMatrixRandomBF16(MemRefDescriptor<bf16, 1> *ptr, // NOLINT
int nrows, int ncols);

extern "C" IMEX_RUNNERUTILS_EXPORT void
_mlir_ciface_printMemrefBF16(UnrankedMemRefType<bf16> *m);
Expand Down
18 changes: 17 additions & 1 deletion lib/ExecutionEngine/ImexRunnerUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <random>
#include <string>

// NOLINTBEGIN(*-identifier-naming)
Expand All @@ -37,6 +38,21 @@ _mlir_ciface_fillResource1DF16(MemRefDescriptor<f16, 1> *ptr, // NOLINT
std::fill_n(ptr->allocated, ptr->sizes[0], f16_val);
}

/// Fills the given 2D memref (i.e. matrix) passed as 1D memref
// with randomly generated values. Numbers of rows and cols needs to be
// specified and strides are assumed to be [ncols, 1]
extern "C" void
_mlir_ciface_fillMatrixRandomBF16(MemRefDescriptor<bf16, 1> *ptr, // NOLINT
int nrows, int ncols) {
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<> dist(-0.5f, 0.5f);

for (int i = 0; i < nrows * ncols; i++) {
ptr->allocated[i] = dist(gen);
}
}

/// Fills the given 1D float (f32) memref with the given float value.
extern "C" void
_mlir_ciface_fillResource1DF32(MemRefDescriptor<float, 1> *ptr, // NOLINT
Expand Down Expand Up @@ -143,7 +159,7 @@ extern "C" bool _mlir_ciface_allcloseBF16(UnrankedMemRefType<bf16> *M,
// https://numpy.org/doc/stable/reference/generated/numpy.allclose.html
// values may need to adjusted in the future
const float atol = 1e-08;
const float rtol = 1e-05;
const float rtol = 1e-01;
DynamicMemRefType<bf16> DM = DynamicMemRefType<bf16>(*M);
DynamicMemRefType<float> DN = DynamicMemRefType<float>(*N);
DynamicMemRefIterator<bf16> i = DM.begin();
Expand Down
11 changes: 11 additions & 0 deletions lib/ExecutionEngine/LEVELZERORUNTIME/LevelZeroRuntimeWrappers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -385,9 +385,20 @@ static ze_module_handle_t loadModule(GPUL0QUEUE *queue, const void *data,
return it->second.module;
}
ze_module_desc_t desc = {};

const char *build_flags = nullptr;
// enable large register file if needed
if (getenv("IMEX_ENABLE_LARGE_REG_FILE")) {
build_flags =
"-vc-codegen -doubleGRF -Xfinalizer -noLocalSplit -Xfinalizer "
"-DPASTokenReduction -Xfinalizer -SWSBDepReduction -Xfinalizer "
"'-printregusage -enableBCR' ";
}

desc.format = ZE_MODULE_FORMAT_IL_SPIRV;
desc.pInputModule = static_cast<const uint8_t *>(data);
desc.inputSize = dataSize;
desc.pBuildFlags = build_flags;
CHECK_ZE_RESULT(zeModuleCreate(gpuL0Queue->zeContext_, gpuL0Queue->zeDevice_,
&desc, &zeModule, nullptr));
std::lock_guard<std::mutex> entryLock(mutexLock);
Expand Down
14 changes: 13 additions & 1 deletion lib/ExecutionEngine/SYCLRUNTIME/SyclRuntimeWrappers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,15 @@ static ze_module_handle_t loadModule(GPUSYCLQUEUE *queue, const void *data,
(const uint8_t *)data,
nullptr,
nullptr};

// enable large register file if needed
if (getenv("IMEX_ENABLE_LARGE_REG_FILE")) {
desc.pBuildFlags =
"-vc-codegen -doubleGRF -Xfinalizer -noLocalSplit -Xfinalizer "
"-DPASTokenReduction -Xfinalizer -SWSBDepReduction -Xfinalizer "
"'-printregusage -enableBCR' ";
}

auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
syclQueue.get_device());
auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
Expand Down Expand Up @@ -279,12 +288,15 @@ static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,

// warmups
for (int r = 0; r < warmups; r++) {
enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes);
auto e =
enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes);
e.wait();
}

for (int r = 0; r < rounds; r++) {
sycl::event event =
enqueueKernel(syclQueue, kernel, syclNdRange, params, sharedMemBytes);
event.wait();

auto startTime = event.get_profiling_info<
cl::sycl::info::event_profiling::command_start>();
Expand Down
Loading

0 comments on commit 40461b6

Please sign in to comment.