From ea2ddcf4d8189b051978d4d795da7c09d1a8c7bc Mon Sep 17 00:00:00 2001
From: Md Abdullah Shahneous Bari <98356296+mshahneo@users.noreply.github.com>
Date: Fri, 25 Oct 2024 17:41:48 -0500
Subject: [PATCH] [Conversion][MathToVC] Add MathToVC pass. (#943)

Convert selective math ops to vc-intrinsics based on condition.

Currently supported ops:
math.ceil,
math.floor,
math.exp (only for vectors),
math.exp2 (only for vectors).

The pass provides an option for high-precision interim calculation.
Exp op conversion requires multiplication of the input by log2(e); this option
allows these operations to be done in f32 if the input data type is lower precision.
Default is to do these operations in the actual precision of the exp input.
---
 include/imex/Conversion/CMakeLists.txt        |   1 +
 .../imex/Conversion/MathToVC/CMakeLists.txt   |   0
 include/imex/Conversion/MathToVC/MathToVC.h   |  51 +++
 include/imex/Conversion/Passes.h              |   1 +
 include/imex/Conversion/Passes.td             |  30 ++
 .../Utils.h => include/imex/Utils/VCUtils.h   |  12 +-
 include/imex/Utils/XeCommon.h                 |   6 +
 lib/Conversion/CMakeLists.txt                 |   1 +
 lib/Conversion/MathToVC/CMakeLists.txt        |  21 ++
 lib/Conversion/MathToVC/MathToVC.cpp          | 318 ++++++++++++++++++
 lib/Conversion/XeGPUToVC/CMakeLists.txt       |   1 -
 lib/Conversion/XeGPUToVC/LSCPatterns.cpp      |   2 +-
 lib/Conversion/XeGPUToVC/XeGPUToVC.cpp        |  20 +-
 lib/Utils/CMakeLists.txt                      |   1 +
 .../XeGPUToVC/Utils.cpp => Utils/VCUtils.cpp} |   5 +-
 lib/Utils/XeCommon.cpp                        |  14 +-
 test/Conversion/MathToVC/math-to-vc.mlir      |  78 +++++
 test/Conversion/XeGPUToVC/eltwise.mlir        |   6 +-
 .../Dialect/Gpu/ceil_floor_BF16.mlir          |  53 +++
 test/Integration/Dialect/Gpu/gpu-to-llvm.pp   |   1 +
 .../Dialect/XeGPU/ceil_floor_f32.mlir         |  84 +++++
 .../Dialect/XeGPU/xegpu-to-func-vc.pp         |   1 +
 22 files changed, 682 insertions(+), 25 deletions(-)
 create mode 100644 include/imex/Conversion/MathToVC/CMakeLists.txt
 create mode 100644 include/imex/Conversion/MathToVC/MathToVC.h
 rename lib/Conversion/XeGPUToVC/Utils.h => include/imex/Utils/VCUtils.h (89%)
 create mode 100644 lib/Conversion/MathToVC/CMakeLists.txt
 create mode 100644 lib/Conversion/MathToVC/MathToVC.cpp
 rename lib/{Conversion/XeGPUToVC/Utils.cpp => Utils/VCUtils.cpp} (95%)
 create mode 100644 test/Conversion/MathToVC/math-to-vc.mlir
 create mode 100644 test/Integration/Dialect/Gpu/ceil_floor_BF16.mlir
 create mode 100644 test/Integration/Dialect/XeGPU/ceil_floor_f32.mlir

diff --git a/include/imex/Conversion/CMakeLists.txt b/include/imex/Conversion/CMakeLists.txt
index db25c8fc0..635dcd12f 100644
--- a/include/imex/Conversion/CMakeLists.txt
+++ b/include/imex/Conversion/CMakeLists.txt
@@ -9,3 +9,4 @@ add_subdirectory(DistToStandard)
 add_subdirectory(DropRegions)
 add_subdirectory(XeTileToXeGPU)
 add_subdirectory(XeGPUToVC)
+add_subdirectory(MathToVC)
diff --git a/include/imex/Conversion/MathToVC/CMakeLists.txt b/include/imex/Conversion/MathToVC/CMakeLists.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/include/imex/Conversion/MathToVC/MathToVC.h b/include/imex/Conversion/MathToVC/MathToVC.h
new file mode 100644
index 000000000..7128cd874
--- /dev/null
+++ b/include/imex/Conversion/MathToVC/MathToVC.h
@@ -0,0 +1,51 @@
+//===- MathToVC.h - Conversion---------------*- C++ -*-===//
+//
+// Copyright 2024 Intel Corporation
+// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the conversion of select math dialect operations into
+/// Func dialect calls to vc-intrinsics functions
+///
+//===----------------------------------------------------------------------===//
+#ifndef IMEX_CONVERSION_MATHTOVC_H
+#define IMEX_CONVERSION_MATHTOVC_H
+
+#include <mlir/Dialect/Math/IR/Math.h>
+#include <mlir/Dialect/Vector/IR/VectorOps.h>
+
+#include "imex/Utils/XeCommon.h"
+
+namespace mlir {
+
+class ConversionTarget;
+class LLVMTypeConverter;
+class Pass;
+class Operation;
+class RewritePatternSet;
+template <typename T> class OperationPass;
+
+namespace gpu {
+class GPUModuleOp;
+} // namespace gpu
+
+} // namespace mlir
+
+namespace imex {
+#define GEN_PASS_DECL_CONVERTMATHTOVC
+#include "imex/Conversion/Passes.h.inc"
+
+void populateMathToVCPatterns(
+    ::mlir::LLVMTypeConverter &typeConverter,
+    ::mlir::RewritePatternSet &patterns,
+    bool enableHighPrecisionInterimCalculation = false);
+void configureMathToVCConversionLegality(::mlir::ConversionTarget &target);
+std::unique_ptr<::mlir::OperationPass<::mlir::gpu::GPUModuleOp>>
+createConvertMathToVCPass();
+
+} // namespace imex
+#endif
diff --git a/include/imex/Conversion/Passes.h b/include/imex/Conversion/Passes.h
index d466ba422..0cb9c8c81 100644
--- a/include/imex/Conversion/Passes.h
+++ b/include/imex/Conversion/Passes.h
@@ -22,6 +22,7 @@
 #include <imex/Conversion/GPUToGPUX/GPUToGPUX.h>
 #include <imex/Conversion/GPUToSPIRV/GPUToSPIRVPass.h>
 #include <imex/Conversion/GPUXToLLVM/GPUXToLLVMPass.h>
+#include <imex/Conversion/MathToVC/MathToVC.h>
 #include <imex/Conversion/NDArrayToLinalg/NDArrayToLinalg.h>
 #include <imex/Conversion/XeGPUToVC/XeGPUToVC.h>
 #include <imex/Conversion/XeTileToXeGPU/XeTileToXeGPU.h>
diff --git a/include/imex/Conversion/Passes.td b/include/imex/Conversion/Passes.td
index 40b089050..1ad939d4a 100644
--- a/include/imex/Conversion/Passes.td
+++ b/include/imex/Conversion/Passes.td
@@ -427,4 +427,34 @@ def ConvertXeGPUToVC : Pass<"convert-xegpu-to-vc", "::mlir::gpu::GPUModuleOp"> {
   let constructor = "imex::createConvertXeGPUToVCPass()";
 }
 
+
+//===----------------------------------------------------------------------===//
+// MathToVC
+//===----------------------------------------------------------------------===//
+// high-precision-interim-calculation
+def ConvertMathToVC : Pass<"convert-math-to-vc", "::mlir::gpu::GPUModuleOp"> {
+  let summary = "Generate vc-intrinsics functions for select math dialect operations";
+  let description = [{
+    Convert select math dialect operations into the Func dialect calls to vc-intrinsics
+    functions.
+    Some math operations are not supported by the VC compiler (IGC vector backend)
+    and need to be converted to vc-intrinsic calls.
+    This pass converts these math operations to vc-intrinsics.
+    }];
+  let options = [
+    Option<"enableHighPrecisionInterimCalculation", "enable-high-precision-interim-calculation", "bool",
+           /*default=*/"false",
+           "Enables high precision (f32) interim calculation for math operations. "
+           "Any interim instruction added as part of the conversion will be high precision (f32).">
+  ];
+
+  let dependentDialects = ["::mlir::math::MathDialect",
+                           "::mlir::vector::VectorDialect",
+                           "::mlir::LLVM::LLVMDialect",
+                           "::mlir::func::FuncDialect",
+                           "::mlir::arith::ArithDialect"
+                           ];
+  let constructor = "imex::createConvertMathToVCPass()";
+}
+
 #endif // _IMEX_CONVERSION_PASSES_TD_INCLUDED_
diff --git a/lib/Conversion/XeGPUToVC/Utils.h b/include/imex/Utils/VCUtils.h
similarity index 89%
rename from lib/Conversion/XeGPUToVC/Utils.h
rename to include/imex/Utils/VCUtils.h
index 0f9c06fb9..20ac41811 100644
--- a/lib/Conversion/XeGPUToVC/Utils.h
+++ b/include/imex/Utils/VCUtils.h
@@ -8,14 +8,20 @@
 //===----------------------------------------------------------------------===//
 ///
 /// \file
-/// This file defines some utils used in XeGPUToVC pass
+/// This file defines some utils used in ConversionToVC passes
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef XEGPU_VC_UTILS_H
-#define XEGPU_VC_UTILS_H
+#ifndef VC_UTILS_H
+#define VC_UTILS_H
 
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/Transforms/DialectConversion.h"
 
 using namespace mlir;
 
diff --git a/include/imex/Utils/XeCommon.h b/include/imex/Utils/XeCommon.h
index d14603e10..f2d08884a 100644
--- a/include/imex/Utils/XeCommon.h
+++ b/include/imex/Utils/XeCommon.h
@@ -617,6 +617,12 @@ llvm::SmallVector<T> swapLastTwoElements(llvm::ArrayRef<T> shape) {
 ///   output strides = 60x20x5x1
 llvm::SmallVector<int64_t> defaultStrides(llvm::ArrayRef<int64_t> shape);
 
+/// Checks if the given `type` is a 1-D vector type that requires VectorAnyINTEL
+/// capability. In other words, the vector size is not supported by SPIR-V.
+/// SPIR-V only supports 2, 3, 4, 8, 16 elements (8 and 16 with Vector16
+/// capability).
+bool isVectorAnyINTELType(mlir::Type type);
+
 } // namespace imex
 
 #endif
diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt
index 813781dd3..b52337e57 100644
--- a/lib/Conversion/CMakeLists.txt
+++ b/lib/Conversion/CMakeLists.txt
@@ -4,5 +4,6 @@ add_subdirectory(DropRegions)
 add_subdirectory(GPUToSPIRV)
 add_subdirectory(GPUToGPUX)
 add_subdirectory(GPUXToLLVM)
+add_subdirectory(MathToVC)
 add_subdirectory(XeTileToXeGPU)
 add_subdirectory(XeGPUToVC)
diff --git a/lib/Conversion/MathToVC/CMakeLists.txt b/lib/Conversion/MathToVC/CMakeLists.txt
new file mode 100644
index 000000000..ab5d76327
--- /dev/null
+++ b/lib/Conversion/MathToVC/CMakeLists.txt
@@ -0,0 +1,21 @@
+add_imex_conversion_library(IMEXMathToVC
+  MathToVC.cpp
+
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/MathToVC
+
+  DEPENDS
+  IMEXConversionPassIncGen
+
+  #LINK_COMPONENTS
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRSupport
+  # MLIRTransforms
+  MLIRLLVMCommonConversion
+
+  MLIRGPUDialect
+  MLIRPass
+  )
diff --git a/lib/Conversion/MathToVC/MathToVC.cpp b/lib/Conversion/MathToVC/MathToVC.cpp
new file mode 100644
index 000000000..0e0121c91
--- /dev/null
+++ b/lib/Conversion/MathToVC/MathToVC.cpp
@@ -0,0 +1,318 @@
+//===- MathToVC.cpp - Conversion---------------*- C++ -*-===//
+//
+// Copyright 2024 Intel Corporation
+// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements conversion of the select math dialect operations into
+/// Func dialect calls to vc-intrinsics functions
+///
+//===----------------------------------------------------------------------===//
+
+#include "imex/Conversion/MathToVC/MathToVC.h"
+#include "imex/Utils/VCUtils.h"
+#include "imex/Utils/XeCommon.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/IR/BuiltinDialect.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/ValueRange.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace imex {
+#define GEN_PASS_DEF_CONVERTMATHTOVC
+#include "imex/Conversion/Passes.h.inc"
+} // namespace imex
+
+using namespace mlir;
+using namespace imex;
+
+namespace {
+//===----------------------------------------------------------------------===//
+// Utility functions
+//===----------------------------------------------------------------------===//
+
+// Get the VC intrinsic name prefix for math op MOp (checked at compile time)
+template <typename MOp> std::string getVCIntrinsicName() {
+  constexpr bool isCeilOp = std::is_same_v<MOp, math::CeilOp>;
+  constexpr bool isFloorOp = std::is_same_v<MOp, math::FloorOp>;
+  constexpr bool isExpOp = std::is_same_v<MOp, math::ExpOp>;
+  constexpr bool isExp2Op = std::is_same_v<MOp, math::Exp2Op>;
+  static_assert(isCeilOp || isFloorOp || isExpOp || isExp2Op,
+                "Unsupported math Op. Add more support!");
+  if constexpr (isCeilOp)
+    return "llvm.genx.rndu.";
+  else if constexpr (isFloorOp)
+    return "llvm.genx.rndd.";
+  else // math.exp and math.exp2 both map to the same exp intrinsic
+    return "llvm.genx.exp.";
+}
+
+// Utility function to convert a scalar or vector type of any float bitwidth to
+// another.
+// Returns a type with the same shape as `srcType` (a vector, or a scalar float)
+// whose element type is `dstElementType`.
+Type convertScalarOrVectorFloatType(Type srcType, Type dstElementType) {
+  // Vector: keep the shape, swap the element type.
+  if (auto vecTy = dyn_cast<VectorType>(srcType))
+    return VectorType::get(vecTy.getShape(), dstElementType);
+  // Scalar: anything other than a float is a caller bug. Every path returns a
+  // value, so a release (NDEBUG) build cannot fall off the end of this
+  // non-void function.
+  assert(isa<FloatType>(srcType) && "Unsupported type");
+  return dstElementType;
+}
+
+// Utility function to convert a range of float args to a specific float type.
+// Each arg (scalar float or vector of floats) is converted to dstElementType:
+// an arith.extf / arith.truncf is generated when the bitwidths differ, and the
+// arg is forwarded unchanged when they match. Order of the args is preserved.
+SmallVector<Value> convertFloatArgsType(SmallVector<Value> args,
+                                        Type dstElementType,
+                                        ConversionPatternRewriter &rewriter) {
+  SmallVector<Value> newArgs;
+  auto dstBitWidth = dstElementType.getIntOrFloatBitWidth();
+  for (auto arg : args) {
+    // Resolve the element type once; the vector cast is null for scalars and
+    // must be checked before it is dereferenced.
+    auto vecTy = dyn_cast<VectorType>(arg.getType());
+    Type srcElementType = vecTy ? vecTy.getElementType() : arg.getType();
+    // Assert if src and dst types are not float types
+    assert(isa<FloatType>(srcElementType) && isa<FloatType>(dstElementType) &&
+           "Unsupported type, src and dst both should be float types");
+    auto srcBitWidth = srcElementType.getIntOrFloatBitWidth();
+
+    if (srcBitWidth == dstBitWidth) {
+      newArgs.push_back(arg);
+      continue;
+    }
+    auto newType =
+        convertScalarOrVectorFloatType(arg.getType(), dstElementType);
+    if (srcBitWidth < dstBitWidth) {
+      // Widen to the destination float type
+      auto newOp = rewriter.create<arith::ExtFOp>(arg.getLoc(), newType, arg);
+      newArgs.push_back(newOp);
+    } else {
+      // Narrow to the destination float type
+      auto newOp = rewriter.create<arith::TruncFOp>(arg.getLoc(), newType, arg);
+      newArgs.push_back(newOp);
+    }
+  }
+  return newArgs;
+}
+//===----------------------------------------------------------------------===//
+// Operation conversion
+//===----------------------------------------------------------------------===//
+
+// Elementwise math to vc-intrinsics conversion pattern for ops whose intrinsic
+// is always called on f32: operands are cast to f32 and the result cast back
+template <typename MOp>
+struct ElementwiseFloatOnlyMathOpPattern final
+    : public OpConversionPattern<MOp> {
+  using OpConversionPattern<MOp>::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(MOp op, typename MOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Type opElementType;
+    // Vector results must be 1-D (otherwise no match); scalars are accepted
+    if (auto vecTy = dyn_cast<VectorType>(op.getType())) {
+      if (vecTy.getRank() != 1)
+        return failure();
+      opElementType = vecTy.getElementType();
+    } else {
+      opElementType = op.getType();
+    }
+    auto loc = op.getLoc();
+    auto args = adaptor.getOperands();
+    // Upconvert or downconvert all the operands' element types to f32.
+    // Warn when the element type is wider than 32 bits: the operands are then
+    // truncated, so the result can differ from the original value.
+    if (opElementType.getIntOrFloatBitWidth() > 32)
+      emitWarning(op.getLoc(), "Truncation is done on input during conversion, "
+                               "may result in wrong result.\n");
+    llvm::SmallVector<Value> newArgs =
+        convertFloatArgsType(args, rewriter.getF32Type(), rewriter);
+
+    // Result element type is always f32; resStr becomes "v<N>f32" or "f32"
+    auto newType =
+        convertScalarOrVectorFloatType(op.getType(), rewriter.getF32Type());
+    std::string resStr = "f32";
+    resStr.insert(
+        0, ((dyn_cast<VectorType>(newType))
+                ? llvm::formatv("v{0}",
+                                dyn_cast<VectorType>(newType).getNumElements())
+                      .str()
+                : ""));
+
+    // Intrinsic name = op prefix + type suffix; emit the call to it.
+    auto funcName = getVCIntrinsicName<MOp>();
+    funcName += resStr;
+    auto callOp =
+        createFuncCall(rewriter, loc, funcName, {newType}, newArgs, false);
+
+    // Wrap the call result in a vector so convertFloatArgsType can take it
+    SmallVector<Value> callOpResult;
+    callOpResult.push_back(callOp.getResult(0));
+
+    // Cast the call result back to the op's element type (holds Values)
+    auto originalResultType =
+        convertFloatArgsType(callOpResult, opElementType, rewriter);
+
+    rewriter.replaceOp(op, originalResultType);
+    return success();
+  }
+};
+
+// ExpOp conversion pattern for 1-D vectors; handles math::exp and math::exp2
+template <typename ExpOp>
+struct ExpOpPattern final : public OpConversionPattern<ExpOp> {
+  ExpOpPattern(MLIRContext *ctx, bool enableHighPrecisionInterimCalculation)
+      : OpConversionPattern<ExpOp>(ctx),
+        enableHighPrecisionInterimCalculation(
+            enableHighPrecisionInterimCalculation) {}
+  LogicalResult
+  matchAndRewrite(ExpOp op, typename ExpOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto vecTy = dyn_cast<VectorType>(op.getType());
+
+    // Only deal with Exp op with 1-D vector type
+    if (!(vecTy && vecTy.getRank() == 1))
+      return failure();
+
+    auto loc = op.getLoc();
+
+    // "llvm.genx.exp" returns the base 2 exponentiation of the input.
+    // To get the base e exponentiation, we need to scale the input by log2(e)
+    bool isExpOp = std::is_same_v<ExpOp, math::ExpOp>;
+    auto operands = adaptor.getOperands();
+    SmallVector<Value> args{operands};
+    // math.exp only: multiply the input by a splat of log2(e) first
+    if (isExpOp) {
+      // Create the intermediate instructions of f32 vector type if the element
+      // type is less than 32 bits
+      if (this->enableHighPrecisionInterimCalculation &&
+          vecTy.getElementType().getIntOrFloatBitWidth() < 32) {
+        auto interimVectorType =
+            VectorType::get(vecTy.getShape(), rewriter.getF32Type());
+        auto vecAttr = DenseElementsAttr::get(
+            interimVectorType,
+            rewriter.getFloatAttr(interimVectorType.getElementType(),
+                                  1.442695040888963));
+        auto log2eConstVec =
+            rewriter.create<arith::ConstantOp>(loc, interimVectorType, vecAttr);
+        auto input = convertFloatArgsType({operands[0]}, rewriter.getF32Type(),
+                                          rewriter);
+        auto scaledInputf32 = rewriter.create<arith::MulFOp>(
+            op.getLoc(), input[0], log2eConstVec);
+        auto scaledInput = convertFloatArgsType(
+            {scaledInputf32}, vecTy.getElementType(), rewriter);
+        args.clear();
+        args.push_back(scaledInput[0]);
+      } else {
+        auto vecAttr = DenseElementsAttr::get(
+            vecTy,
+            rewriter.getFloatAttr(vecTy.getElementType(), 1.442695040888963));
+        auto log2eConstVec =
+            rewriter.create<arith::ConstantOp>(loc, vecTy, vecAttr);
+        auto input = operands[0];
+        auto scaledInput =
+            rewriter.create<arith::MulFOp>(op.getLoc(), input, log2eConstVec);
+        args.clear();
+        args.push_back(scaledInput);
+      }
+    }
+    // Intrinsic name = op prefix + the suffix encodeVectorType yields for vecTy
+    auto funcName = getVCIntrinsicName<ExpOp>();
+    funcName += encodeVectorType(rewriter, vecTy).first;
+    auto callOp =
+        createFuncCall(rewriter, loc, funcName, {op.getType()}, args, false);
+    rewriter.replaceOp(op, callOp);
+    return success();
+  }
+
+private:
+  const bool enableHighPrecisionInterimCalculation;
+};
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Pattern Population
+//===----------------------------------------------------------------------===//
+
+void imex::populateMathToVCPatterns(
+    ::mlir::LLVMTypeConverter &typeConverter,
+    ::mlir::RewritePatternSet &patterns,
+    bool enableHighPrecisionInterimCalculation) {
+  // ceil/floor take no options; exp/exp2 get the interim-precision flag
+  MLIRContext *ctx = patterns.getContext();
+  patterns.add<ElementwiseFloatOnlyMathOpPattern<math::CeilOp>,
+               ElementwiseFloatOnlyMathOpPattern<math::FloorOp>>(ctx);
+  patterns.add<ExpOpPattern<math::ExpOp>, ExpOpPattern<math::Exp2Op>>(
+      ctx, enableHighPrecisionInterimCalculation);
+}
+
+//===----------------------------------------------------------------------===//
+// Conversion Legality configuration
+//===----------------------------------------------------------------------===//
+void imex::configureMathToVCConversionLegality(
+    ::mlir::ConversionTarget &target) {
+  // The func and arith dialects are marked legal on the target.
+  target.addLegalDialect<func::FuncDialect, arith::ArithDialect>();
+  // math.exp and math.exp2 are illegal (i.e. must be converted) only when
+  // their result is a 1-D vector; scalars and higher-rank vectors stay legal.
+  target.addDynamicallyLegalOp<math::ExpOp, math::Exp2Op>([&](Operation *op) {
+    // A non-vector result yields a null cast and is therefore legal.
+    auto resultVecTy = dyn_cast<VectorType>(op->getResult(0).getType());
+    const bool isRank1Vector = resultVecTy && resultVecTy.getRank() == 1;
+    return !isRank1Vector;
+  });
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Definition
+//===----------------------------------------------------------------------===//
+
+namespace {
+struct MathToVCPass : public imex::impl::ConvertMathToVCBase<MathToVCPass> {
+  using Base::Base;
+  // Construct the pass with the interim-precision option preset.
+  explicit MathToVCPass(bool highPrecisionInterim) {
+    this->enableHighPrecisionInterimCalculation.setValue(highPrecisionInterim);
+  }
+  void runOnOperation() override {
+    gpu::GPUModuleOp m = getOperation();
+    LLVMTypeConverter typeConverter(&getContext());
+    ConversionTarget target(getContext());
+    RewritePatternSet patterns(&getContext());
+
+    // Add patterns
+    imex::populateMathToVCPatterns(
+        typeConverter, patterns,
+        this->enableHighPrecisionInterimCalculation.getValue());
+    configureMathToVCConversionLegality(target);
+
+    if (failed(applyPartialConversion(m, target, std::move(patterns))))
+      return signalPassFailure();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
+imex::createConvertMathToVCPass() {
+  return std::make_unique<MathToVCPass>(); // options keep their defaults
+}
diff --git a/lib/Conversion/XeGPUToVC/CMakeLists.txt b/lib/Conversion/XeGPUToVC/CMakeLists.txt
index f3f0d6699..262a04ec9 100644
--- a/lib/Conversion/XeGPUToVC/CMakeLists.txt
+++ b/lib/Conversion/XeGPUToVC/CMakeLists.txt
@@ -1,7 +1,6 @@
 add_imex_conversion_library(IMEXXeGPUToVC
   LSCPatterns.cpp
   XeGPUToVC.cpp
-  Utils.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/XeGPUToVC
diff --git a/lib/Conversion/XeGPUToVC/LSCPatterns.cpp b/lib/Conversion/XeGPUToVC/LSCPatterns.cpp
index c6dfec61b..4c81c523b 100644
--- a/lib/Conversion/XeGPUToVC/LSCPatterns.cpp
+++ b/lib/Conversion/XeGPUToVC/LSCPatterns.cpp
@@ -38,7 +38,7 @@
 #include "llvm/Support/FormatVariadic.h"
 
 #include "LscIntrinsicEnums.h"
-#include "Utils.h"
+#include "imex/Utils/VCUtils.h"
 
 using namespace mlir;
 using mlir::xegpu::AtomicRMWOp;
diff --git a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
index ed9369a4c..19ea6d699 100644
--- a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
+++ b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
@@ -34,13 +34,13 @@
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 
+#include "imex/Conversion/MathToVC/MathToVC.h"
+#include "imex/Utils/VCUtils.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/FormatVariadic.h"
 
-#include "Utils.h"
-
 namespace imex {
 #define GEN_PASS_DEF_CONVERTXEGPUTOVC
 #include "imex/Conversion/Passes.h.inc"
@@ -889,15 +889,6 @@ struct XeGPUToVCPass : public imex::impl::ConvertXeGPUToVCBase<XeGPUToVCPass> {
       return true;
     });
 
-    target.addDynamicallyLegalOp<math::ExpOp>([&](math::ExpOp op) {
-      if (auto vecTy = dyn_cast<VectorType>(op.getType())) {
-        if (vecTy.getRank() != 1)
-          return true;
-        return false;
-      }
-      return true;
-    });
-
     target.addIllegalOp<ShapeCastOp>();
 
     // TODO: can we change it to addDynamicLegalOp?
@@ -962,8 +953,7 @@ struct XeGPUToVCPass : public imex::impl::ConvertXeGPUToVCBase<XeGPUToVCPass> {
 
     // Ops to llvm.genx only Patterns
     patterns.add<NbarrierWaitPattern, CompilerHintPattern,
-                 ElementwiseToVCPattern<arith::MaximumFOp>,
-                 ElementwiseToVCPattern<math::ExpOp>, DpasPattern,
+                 ElementwiseToVCPattern<arith::MaximumFOp>, DpasPattern,
                  NbarrierArrivePattern>(patterns.getContext());
 
     // Ops to LSC only patterns
@@ -971,6 +961,10 @@ struct XeGPUToVCPass : public imex::impl::ConvertXeGPUToVCBase<XeGPUToVCPass> {
 
     populateLoadStoreLSCPatterns(typeConverter, patterns);
 
+    populateMathToVCPatterns(typeConverter, patterns);
+
+    configureMathToVCConversionLegality(target);
+
     if (failed(applyPartialConversion(m, target, std::move(patterns))))
       return signalPassFailure();
   }
diff --git a/lib/Utils/CMakeLists.txt b/lib/Utils/CMakeLists.txt
index 7b6ad3baa..75dcf17cf 100644
--- a/lib/Utils/CMakeLists.txt
+++ b/lib/Utils/CMakeLists.txt
@@ -2,6 +2,7 @@ add_mlir_library(IMEXUtil
     FuncUtils.cpp
     PassUtils.cpp
     TypeConversion.cpp
+    VCUtils.cpp
     XeCommon.cpp
     XeArch.cpp
 
diff --git a/lib/Conversion/XeGPUToVC/Utils.cpp b/lib/Utils/VCUtils.cpp
similarity index 95%
rename from lib/Conversion/XeGPUToVC/Utils.cpp
rename to lib/Utils/VCUtils.cpp
index 7efc9efd0..f2c4f2c2f 100644
--- a/lib/Conversion/XeGPUToVC/Utils.cpp
+++ b/lib/Utils/VCUtils.cpp
@@ -8,10 +8,11 @@
 //===----------------------------------------------------------------------===//
 ///
 /// \file
-/// This file implements some utils used in XeGPUToVC pass
+/// This file implements some utils used in ConversionToVC passes
 ///
 //===----------------------------------------------------------------------===//
 
+#include "imex/Utils/VCUtils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -28,7 +29,7 @@ FlatSymbolRefAttr getFuncRefAttr(gpu::GPUModuleOp module, StringRef name,
                                  TypeRange resultType, ValueRange operands,
                                  bool isVectorComputeFunction,
                                  bool emitCInterface,
-                                 bool emitSPIRVLinkage = true) {
+                                 bool emitSPIRVLinkage /* = true*/) {
   MLIRContext *context = module.getContext();
   auto result = SymbolRefAttr::get(context, name);
 
diff --git a/lib/Utils/XeCommon.cpp b/lib/Utils/XeCommon.cpp
index e4a71c197..c701d9155 100644
--- a/lib/Utils/XeCommon.cpp
+++ b/lib/Utils/XeCommon.cpp
@@ -12,10 +12,10 @@
 /// routines used by Xe related dialects.
 ///
 //===----------------------------------------------------------------------===//
-
 #include <mlir/Dialect/Func/IR/FuncOps.h>
 #include <mlir/Dialect/GPU/IR/GPUDialect.h>
 #include <mlir/Dialect/SCF/IR/SCF.h>
+#include <unordered_set>
 
 #include "imex/Dialect/XeTile/IR/XeTileOps.h"
 #include "imex/Utils/DebugUtils.h"
@@ -211,4 +211,16 @@ mlir::TypedValue<mlir::VectorType> stack(mlir::Value vecUp, mlir::Value vecDown,
   return op;
 }
 
+/// Checks if the given `type` is a 1-D vector type that requires VectorAnyINTEL
+/// capability. In other words, the vector size is not supported by SPIR-V.
+/// SPIR-V only supports 2, 3, 4, 8, 16 elements (8 and 16 with Vector16
+/// capability). Returns false for non-vector types and for rank != 1.
+bool isVectorAnyINTELType(mlir::Type type) {
+  static const std::unordered_set<int64_t> supportedSizes = {2, 3, 4, 8, 16};
+  auto vecType = mlir::dyn_cast<mlir::VectorType>(type);
+  if (!vecType || vecType.getRank() != 1)
+    return false;
+  return supportedSizes.count(vecType.getNumElements()) == 0;
+}
+
 } // namespace imex
diff --git a/test/Conversion/MathToVC/math-to-vc.mlir b/test/Conversion/MathToVC/math-to-vc.mlir
new file mode 100644
index 000000000..f0fb831dd
--- /dev/null
+++ b/test/Conversion/MathToVC/math-to-vc.mlir
@@ -0,0 +1,78 @@
+// RUN: imex-opt -convert-math-to-vc -verify-diagnostics  %s | FileCheck %s --check-prefixes=CHECK
+// RUN: imex-opt -convert-math-to-vc="enable-high-precision-interim-calculation=true" -verify-diagnostics %s | FileCheck %s --check-prefixes=HIGH_PRECISION
+
+module @gemm attributes {gpu.container_module} {
+  gpu.module @math_to_vc {
+    // CHECK-LABEL: gpu.func @ceil_f16
+    gpu.func @ceil_f16(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>}{
+      %c0 = arith.constant 0 : index
+      %0 = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      // CHECK: %[[EXTF_F32:.*]] = arith.extf {{.*}} : vector<16xf16> to vector<16xf32>
+      // CHECK-NEXT: %[[CEILF:.*]] = func.call @llvm.genx.rndu.v16f32(%[[EXTF_F32]]) : (vector<16xf32>) -> vector<16xf32>
+      // CHECK-NEXT: %[[TRUNC_F16:.*]] = arith.truncf %[[CEILF]] : vector<16xf32> to vector<16xf16>
+      %2 = math.ceil %0 : vector<16xf16>
+      gpu.return
+    }
+
+    // -----
+
+    // CHECK-LABEL: gpu.func @ceil_f64
+    gpu.func @ceil_f64(%arg0: memref<8x16xf64>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>}{
+      %c0 = arith.constant 0 : index
+      %0 = vector.load %arg0[%c0, %c0] : memref<8x16xf64>, vector<16xf64>
+      // CHECK: %[[TRUNCF_F32:.*]] = arith.truncf {{.*}} : vector<16xf64> to vector<16xf32>
+      // CHECK-NEXT: %[[CEILF:.*]] = func.call @llvm.genx.rndu.v16f32(%[[TRUNCF_F32]]) : (vector<16xf32>) -> vector<16xf32>
+      // CHECK-NEXT: %[[EXTF_F64:.*]] = arith.extf %[[CEILF]] : vector<16xf32> to vector<16xf64>
+      // expected-warning@+1 {{Truncation is done on input during conversion, may result in wrong result.}}
+      %2 = math.ceil %0 : vector<16xf64>
+      gpu.return
+    }
+
+    // -----
+
+    // CHECK-LABEL: gpu.func @floor_f16
+    gpu.func @floor_f16(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>}{
+      %c0 = arith.constant 0 : index
+      %0 = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      // CHECK: %[[EXTF_F32:.*]] = arith.extf {{.*}} : vector<16xf16> to vector<16xf32>
+      // CHECK-NEXT: %[[CEILF:.*]] = func.call @llvm.genx.rndd.v16f32(%[[EXTF_F32]]) : (vector<16xf32>) -> vector<16xf32>
+      // CHECK-NEXT: %[[TRUNC_F16:.*]] = arith.truncf %[[CEILF]] : vector<16xf32> to vector<16xf16>
+      %2 = math.floor %0 : vector<16xf16>
+      gpu.return
+    }
+
+    // -----
+
+    // CHECK-LABEL: gpu.func @exp_f16
+    // HIGH_PRECISION-LABEL: gpu.func @exp_f16
+    gpu.func @exp_f16(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>}{
+      %c0 = arith.constant 0 : index
+      // HIGH_PRECISION: %[[VEC:.*]] = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      %v1 = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      // CHECK: %[[LOG2E_VEC:.*]] = arith.constant dense<1.44{{.*}}> : vector<16xf16>
+      // CHECK-NEXT: %[[MULF:.*]] = arith.mulf {{.*}} %[[LOG2E_VEC]]
+      // CHECK-NEXT: func.call @llvm.genx.exp.v8i32(%[[MULF]])
+      // HIGH_PRECISION: %[[LOG2E_VEC:.*]] = arith.constant dense<1.44{{.*}}> : vector<16xf32>
+      // HIGH_PRECISION: %[[VEC_F32:.*]] = arith.extf %[[VEC]] : vector<16xf16> to vector<16xf32>
+      // HIGH_PRECISION: %[[MULF_F32:.*]] = arith.mulf %[[VEC_F32]], %[[LOG2E_VEC]] : vector<16xf32>
+      // HIGH_PRECISION: %[[MULF:.*]] = arith.truncf %[[MULF_F32]] : vector<16xf32> to vector<16xf16>
+      // HIGH_PRECISION: func.call @llvm.genx.exp.v8i32(%[[MULF]])
+      %2 = math.exp %v1 : vector<16xf16>
+      gpu.return
+    }
+
+    // -----
+
+    // CHECK-LABEL: gpu.func @exp2_f16
+    gpu.func @exp2_f16(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>}{
+      %c0 = arith.constant 0 : index
+      // CHECK: %[[VEC:.*]] = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      %v1 = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      // CHECK-NEXT: func.call @llvm.genx.exp.v8i32(%[[VEC]])
+      %2 = math.exp2 %v1 : vector<16xf16>
+      gpu.return
+    }
+  }
+}
+
+// -----
diff --git a/test/Conversion/XeGPUToVC/eltwise.mlir b/test/Conversion/XeGPUToVC/eltwise.mlir
index f177e0f2d..24d59e3ac 100644
--- a/test/Conversion/XeGPUToVC/eltwise.mlir
+++ b/test/Conversion/XeGPUToVC/eltwise.mlir
@@ -7,8 +7,7 @@ module @gemm attributes {gpu.container_module} {
       %c0 = arith.constant 0 : index
       %cv1 = arith.constant dense<1.0> : vector<16xf32>
       %v1 = vector.load %arg0[%c0, %c0] : memref<8x16xf32>, vector<16xf32>
-      // CHECK: %[[LOG2E:.*]] = arith.constant 1.44{{.*}} f32
-      // CHECK-NEXT: %[[LOG2E_VEC:.*]] = vector.broadcast %[[LOG2E]] : f32 to vector<16xf32>
+      // CHECK: %[[LOG2E_VEC:.*]] = arith.constant dense<1.44{{.*}}> : vector<16xf32>
       // CHECK-NEXT: %[[MULF:.*]] = arith.mulf {{.*}} %[[LOG2E_VEC]]
       // CHECK-NEXT: func.call @llvm.genx.exp.v16f32(%[[MULF]])
       %1 = math.exp %v1 fastmath<nnan> : vector<16xf32>
@@ -31,8 +30,7 @@ module @gemm attributes {gpu.container_module} {
     gpu.func @exp_f16(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>}{
       %c0 = arith.constant 0 : index
       %v1 = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
-      // CHECK: %[[LOG2E:.*]] = arith.constant 1.44{{.*}} f16
-      // CHECK-NEXT: %[[LOG2E_VEC:.*]] = vector.broadcast %[[LOG2E]] : f16 to vector<16xf16>
+      // CHECK: %[[LOG2E_VEC:.*]] = arith.constant dense<1.44{{.*}}> : vector<16xf16>
       // CHECK-NEXT: %[[MULF:.*]] = arith.mulf {{.*}} %[[LOG2E_VEC]]
       // CHECK-NEXT: func.call @llvm.genx.exp.v8i32(%[[MULF]])
       %2 = math.exp %v1 : vector<16xf16>
diff --git a/test/Integration/Dialect/Gpu/ceil_floor_BF16.mlir b/test/Integration/Dialect/Gpu/ceil_floor_BF16.mlir
new file mode 100644
index 000000000..c6498e8d7
--- /dev/null
+++ b/test/Integration/Dialect/Gpu/ceil_floor_BF16.mlir
@@ -0,0 +1,53 @@
+// RUN: IMEX_USE_IGC_VECTOR_BACK_END=1 %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/gpu-to-llvm.pp \
+// RUN:                                       --runner imex-cpu-runner  -e main \
+// RUN:                                       --entry-point-result=void \
+// RUN:                                       --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
+// RUN: IMEX_USE_IGC_VECTOR_BACK_END=1 %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/gpu-to-llvm.pp \
+// RUN:                                        --runner imex-cpu-runner  -e main \
+// RUN:                                        --entry-point-result=void \
+// RUN:                                        --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
+
+module @eltwise_add attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_10x20xbf16 : memref<10x20xbf16> = dense<5.000000e-01>
+  func.func @test(%arg0: memref<10x20xbf16>, %arg1: memref<10x20xbf16>) -> memref<10x20xbf16> {
+    %c20 = arith.constant 20 : index
+    %c10 = arith.constant 10 : index
+    %c1 = arith.constant 1 : index
+    %memref = gpu.alloc  host_shared () : memref<10x20xbf16>  // host_shared copy of %arg1 (second kernel operand)
+    memref.copy %arg1, %memref : memref<10x20xbf16> to memref<10x20xbf16>
+    %memref_0 = gpu.alloc  host_shared () : memref<10x20xbf16>  // host_shared copy of %arg0 (first kernel operand)
+    memref.copy %arg0, %memref_0 : memref<10x20xbf16> to memref<10x20xbf16>
+    %memref_1 = gpu.alloc  host_shared () : memref<10x20xbf16>  // kernel output buffer
+    gpu.launch_func  @test_kernel::@test_kernel blocks in (%c10, %c20, %c1) threads in (%c1, %c1, %c1)  args(%memref_0 : memref<10x20xbf16>, %memref : memref<10x20xbf16>, %memref_1 : memref<10x20xbf16>)
+    %alloc = memref.alloc() : memref<10x20xbf16>  // host buffer that receives the kernel output and is returned
+    memref.copy %memref_1, %alloc : memref<10x20xbf16> to memref<10x20xbf16>
+    gpu.dealloc  %memref_1 : memref<10x20xbf16>
+    gpu.dealloc  %memref_0 : memref<10x20xbf16>
+    gpu.dealloc  %memref : memref<10x20xbf16>
+    return %alloc : memref<10x20xbf16>
+  }
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL, Bfloat16ConversionINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute, SPV_INTEL_bfloat16_conversion]>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<10x20xbf16>, %arg1: memref<10x20xbf16>, %arg2: memref<10x20xbf16>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 10, 20, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %block_id_x = gpu.block_id  x  // one single-thread block per element: the block ids index the 10x20 memrefs
+      %block_id_y = gpu.block_id  y
+      %0 = memref.load %arg0[%block_id_x, %block_id_y] : memref<10x20xbf16>
+      %1 = memref.load %arg1[%block_id_x, %block_id_y] : memref<10x20xbf16>
+      %2 = math.ceil %0 : bf16  // scalar bf16 ceil / floor are the ops under test
+      %3 = math.floor %1 : bf16
+      %4 = arith.addf %2, %3 : bf16  // arg2[x, y] = ceil(arg0[x, y]) + floor(arg1[x, y])
+      memref.store %4, %arg2[%block_id_x, %block_id_y] : memref<10x20xbf16>
+      gpu.return
+    }
+  }
+  func.func @main() {
+    %0 = memref.get_global @__constant_10x20xbf16 : memref<10x20xbf16>  // both inputs are the all-0.5 constant
+    %1 = memref.get_global @__constant_10x20xbf16 : memref<10x20xbf16>
+    %2 = call @test(%0, %1) : (memref<10x20xbf16>, memref<10x20xbf16>) -> memref<10x20xbf16>  // every element is ceil(0.5) + floor(0.5) = 1
+    %cast = memref.cast %2 : memref<10x20xbf16> to memref<*xbf16>
+    // CHECK: Unranked Memref base@ = {{(0x)?[0-9a-f]*}}
+    // CHECK-COUNT-200: 1
+    call @printMemrefBF16(%cast) : (memref<*xbf16>) -> ()
+    return
+  }
+   func.func private @printMemrefBF16(memref<*xbf16>)  attributes {llvm.emit_c_interface}
+}
diff --git a/test/Integration/Dialect/Gpu/gpu-to-llvm.pp b/test/Integration/Dialect/Gpu/gpu-to-llvm.pp
index 9effd7727..5ed75515b 100644
--- a/test/Integration/Dialect/Gpu/gpu-to-llvm.pp
+++ b/test/Integration/Dialect/Gpu/gpu-to-llvm.pp
@@ -4,6 +4,7 @@
 // Ready for imex runner starting from GPU dialect.
 builtin.module(
     imex-vector-linearize
+    gpu.module(convert-math-to-vc{enable-high-precision-interim-calculation=true})
     reconcile-unrealized-casts
     bf16-to-gpu
     imex-convert-gpu-to-spirv
diff --git a/test/Integration/Dialect/XeGPU/ceil_floor_f32.mlir b/test/Integration/Dialect/XeGPU/ceil_floor_f32.mlir
new file mode 100644
index 000000000..9ee678a7c
--- /dev/null
+++ b/test/Integration/Dialect/XeGPU/ceil_floor_f32.mlir
@@ -0,0 +1,84 @@
+// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
+// RUN:                                       --runner imex-cpu-runner -e main \
+// RUN:                                       --entry-point-result=void \
+// RUN:                                       --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
+// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
+// RUN:                                        --runner imex-cpu-runner -e main \
+// RUN:                                        --entry-point-result=void \
+// RUN:                                        --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
+module @gemm attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_8x16xf32 : memref<8x16xf32> = dense<0.0>  // NOTE(review): not referenced anywhere in this test - candidate for removal
+  func.func @test(%arg0: memref<8x16xf32>, %arg1: memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+
+    %memref = gpu.alloc  host_shared () : memref<8x16xf32>  // host_shared copy of %arg0 (first kernel operand)
+    memref.copy %arg0, %memref : memref<8x16xf32> to memref<8x16xf32>
+    %memref_1 = gpu.alloc  host_shared () : memref<8x16xf32>  // host_shared copy of %arg1 (second kernel operand)
+    memref.copy %arg1, %memref_1 : memref<8x16xf32> to memref<8x16xf32>
+    %memref_2 = gpu.alloc  host_shared () : memref<8x16xf32>  // kernel output; returned directly to the caller, not deallocated here
+    gpu.launch_func  @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c8, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_1 : memref<8x16xf32>, %memref_2 : memref<8x16xf32>)
+    gpu.dealloc  %memref : memref<8x16xf32>
+    gpu.dealloc  %memref_1 : memref<8x16xf32>
+    return %memref_2 : memref<8x16xf32>
+  }
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<8x16xf32>, %arg1: memref<8x16xf32>, %arg2: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %thread_id_x = gpu.thread_id x  // launched with 8 threads; each thread handles one 16-element row
+      cf.br ^bb1
+    ^bb1:
+      %0 = xegpu.create_nd_tdesc %arg0[%thread_id_x, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<16xf32>
+      %1 = xegpu.load_nd %0  : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+      %2 = xegpu.create_nd_tdesc %arg1[%thread_id_x, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<16xf32>
+      %3 = xegpu.load_nd %2  : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+      %4 = math.ceil %1 : vector<16xf32>  // ceil applies to arg0 and floor to arg1, matching the host reference ceil(A) + floor(B)
+      %5 = math.floor %3 : vector<16xf32>
+      %6 = arith.addf %4, %5 : vector<16xf32>  // NOTE(review): ceil(x) + floor(y) is symmetric for non-integer inputs, so an operand swap would not be caught by the allclose comparison
+      %7 = xegpu.create_nd_tdesc %arg2[%thread_id_x, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<16xf32>
+      xegpu.store_nd %6, %7  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+      gpu.return
+    }
+  }
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %c_gen_int = arith.constant 0 : i1  // NOTE(review): presumably selects float rather than integer random values - confirm against fillResource1DRandomF32
+    %cf_lower = arith.constant -0.5 : f32
+    %cf_upper = arith.constant 0.5 : f32
+
+    %A = memref.alloc() : memref<8x16xf32>
+    %A_random = memref.cast %A : memref<8x16xf32> to memref<*xf32>
+    call @fillResource1DRandomF32(%A_random, %cf_lower, %cf_upper, %c_gen_int) : (memref<*xf32>, f32, f32, i1) -> ()
+
+    %B = memref.alloc() : memref<8x16xf32>
+    %B_random = memref.cast %B : memref<8x16xf32> to memref<*xf32>
+    call @fillResource1DRandomF32(%B_random, %cf_lower, %cf_upper, %c_gen_int) : (memref<*xf32>, f32, f32, i1) -> ()
+
+    // Compute the host reference: ref[i, j] = ceil(A[i, j]) + floor(B[i, j])
+    %c16 = arith.constant 16 : index
+    %c8 = arith.constant 8 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %ref = memref.alloc() : memref<8x16xf32>
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        %a = memref.load %A[%i, %j] : memref<8x16xf32>
+        %b = memref.load %B[%i, %j] : memref<8x16xf32>
+        %a_ceiled = math.ceil %a : f32
+        %b_floored = math.floor %b : f32
+        %c = arith.addf %a_ceiled, %b_floored : f32
+        memref.store %c, %ref[%i, %j] : memref<8x16xf32>
+      }
+    }
+
+    %C = call @test(%A, %B) : (memref<8x16xf32>, memref<8x16xf32>) -> memref<8x16xf32>  // device result produced by the GPU kernel
+
+    %C_cast = memref.cast %C : memref<8x16xf32> to memref<*xf32>
+    %ref_cast = memref.cast %ref : memref<8x16xf32> to memref<*xf32>
+    // call @printMemrefF32(%C_cast) : (memref<*xf32>) -> ()
+    // CHECK: [ALLCLOSE: TRUE]
+    call @printAllcloseF32(%ref_cast, %C_cast) : (memref<*xf32>, memref<*xf32>) -> ()
+    return
+  }
+  func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
+  func.func private @fillResource1DRandomF32(memref<*xf32>, f32, f32, i1) attributes {llvm.emit_c_interface}
+  func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
+}
diff --git a/test/Integration/Dialect/XeGPU/xegpu-to-func-vc.pp b/test/Integration/Dialect/XeGPU/xegpu-to-func-vc.pp
index c3ca033ea..faa5343c5 100644
--- a/test/Integration/Dialect/XeGPU/xegpu-to-func-vc.pp
+++ b/test/Integration/Dialect/XeGPU/xegpu-to-func-vc.pp
@@ -7,6 +7,7 @@
         imex-xegpu-apply-vnni-transformation,
         imex-xegpu-optimize-transpose)
     imex-vector-linearize
+    gpu.module(convert-math-to-vc{enable-high-precision-interim-calculation=true})
     gpu.module(convert-xegpu-to-vc)
     cse
     reconcile-unrealized-casts