diff --git a/include/imex/Conversion/CMakeLists.txt b/include/imex/Conversion/CMakeLists.txt
index db25c8fc0..635dcd12f 100644
--- a/include/imex/Conversion/CMakeLists.txt
+++ b/include/imex/Conversion/CMakeLists.txt
@@ -9,3 +9,4 @@ add_subdirectory(DistToStandard)
 add_subdirectory(DropRegions)
 add_subdirectory(XeTileToXeGPU)
 add_subdirectory(XeGPUToVC)
+add_subdirectory(MathToVC)
diff --git a/include/imex/Conversion/MathToVC/CMakeLists.txt b/include/imex/Conversion/MathToVC/CMakeLists.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/include/imex/Conversion/MathToVC/MathToVC.h b/include/imex/Conversion/MathToVC/MathToVC.h
new file mode 100644
index 000000000..7128cd874
--- /dev/null
+++ b/include/imex/Conversion/MathToVC/MathToVC.h
@@ -0,0 +1,51 @@
+//===- MathToVC.h - MathToVC conversion -------------------------*- C++ -*-===//
+//
+// Copyright 2024 Intel Corporation
+// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the conversion of select math dialect operations into
+/// Func dialect calls to vc-intrinsics functions.
+///
+//===----------------------------------------------------------------------===//
+#ifndef IMEX_CONVERSION_MATHTOVC_H
+#define IMEX_CONVERSION_MATHTOVC_H
+
+#include <memory>
+#include <mlir/Pass/Pass.h>
+
+#include "imex/Utils/XeCommon.h"
+
+namespace mlir {
+
+class ConversionTarget;
+class LLVMTypeConverter;
+class Pass;
+class Operation;
+class RewritePatternSet;
+template <typename OpT> class OperationPass;
+
+namespace gpu {
+class GPUModuleOp;
+} // namespace gpu
+
+} // namespace mlir
+
+namespace imex {
+#define GEN_PASS_DECL_CONVERTMATHTOVC
+#include "imex/Conversion/Passes.h.inc"
+
+void populateMathToVCPatterns(
+    ::mlir::LLVMTypeConverter &typeConverter,
+    ::mlir::RewritePatternSet &patterns,
+    bool enableHighPrecisionInterimCalculation = false);
+void configureMathToVCConversionLegality(::mlir::ConversionTarget &target);
+std::unique_ptr<::mlir::OperationPass<::mlir::gpu::GPUModuleOp>>
+createConvertMathToVCPass();
+
+} // namespace imex
+#endif // IMEX_CONVERSION_MATHTOVC_H
diff --git a/include/imex/Conversion/Passes.h b/include/imex/Conversion/Passes.h
index d466ba422..0cb9c8c81 100644
--- a/include/imex/Conversion/Passes.h
+++ b/include/imex/Conversion/Passes.h
@@ -22,6 +22,7 @@
 #include <imex/Conversion/GPUToGPUX/GPUToGPUX.h>
 #include <imex/Conversion/GPUToSPIRV/GPUToSPIRV.h>
 #include <imex/Conversion/GPUXToLLVM/GPUXToLLVM.h>
+#include <imex/Conversion/MathToVC/MathToVC.h>
 #include <imex/Conversion/NDArrayToLinalg/NDArrayToLinalg.h>
 #include <imex/Conversion/XeGPUToVC/XeGPUToVC.h>
 #include <imex/Conversion/XeTileToXeGPU/XeTileToXeGPU.h>
diff --git a/include/imex/Conversion/Passes.td b/include/imex/Conversion/Passes.td
index 40b089050..1ad939d4a 100644
--- a/include/imex/Conversion/Passes.td
+++ b/include/imex/Conversion/Passes.td
@@ -427,4 +427,34 @@ def ConvertXeGPUToVC : Pass<"convert-xegpu-to-vc", "::mlir::gpu::GPUModuleOp"> {
   let constructor = "imex::createConvertXeGPUToVCPass()";
 }
 
+
+//===----------------------------------------------------------------------===//
+// MathToVC
+//===----------------------------------------------------------------------===//
+def ConvertMathToVC : Pass<"convert-math-to-vc", "::mlir::gpu::GPUModuleOp"> {
+  let summary = "Generate vc-intrinsics functions for select math dialect operations";
+  let description = [{
+    Convert select math dialect operations into Func dialect calls to
+    vc-intrinsics functions. Some math operations are not supported by the
+    VC compiler (the IGC vector backend), so this pass rewrites them as
+    vc-intrinsic calls.
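+
+    For example, `math.ceil` on a 1-D f16 vector is lowered roughly as
+    follows (illustrative IR; the operand is widened to f32 and the result
+    truncated back):
+
+    ```mlir
+    %1 = arith.extf %0 : vector<16xf16> to vector<16xf32>
+    %2 = func.call @llvm.genx.rndu.v16f32(%1) : (vector<16xf32>) -> vector<16xf32>
+    %3 = arith.truncf %2 : vector<16xf32> to vector<16xf16>
+    ```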
+  }];
+  let options = [
+    Option<"enableHighPrecisionInterimCalculation", "enable-high-precision-interim-calculation", "bool",
+           /*default=*/"false",
+           "Enables high-precision (f32) interim calculation for math operations: "
+           "any interim instruction added as part of the conversion will be high precision (f32).">
+  ];
+
+  let dependentDialects = ["::mlir::math::MathDialect",
+                           "::mlir::vector::VectorDialect",
+                           "::mlir::LLVM::LLVMDialect",
+                           "::mlir::func::FuncDialect",
+                           "::mlir::arith::ArithDialect"
+                          ];
+  let constructor = "imex::createConvertMathToVCPass()";
+}
+
 #endif // _IMEX_CONVERSION_PASSES_TD_INCLUDED_
diff --git a/lib/Conversion/XeGPUToVC/Utils.h b/include/imex/Utils/VCUtils.h
similarity index 89%
rename from lib/Conversion/XeGPUToVC/Utils.h
rename to include/imex/Utils/VCUtils.h
index 0f9c06fb9..20ac41811 100644
--- a/lib/Conversion/XeGPUToVC/Utils.h
+++ b/include/imex/Utils/VCUtils.h
@@ -8,14 +8,20 @@
 //===----------------------------------------------------------------------===//
 ///
 /// \file
-/// This file defines some utils used in XeGPUToVC pass
+/// This file defines some utils used in the VC conversion passes
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef XEGPU_VC_UTILS_H
-#define XEGPU_VC_UTILS_H
+#ifndef VC_UTILS_H
+#define VC_UTILS_H
 
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/Transforms/DialectConversion.h"
 
 using namespace mlir;
diff --git a/include/imex/Utils/XeCommon.h b/include/imex/Utils/XeCommon.h
index d14603e10..f2d08884a 100644
--- a/include/imex/Utils/XeCommon.h
+++ b/include/imex/Utils/XeCommon.h
@@ -617,6 +617,12 @@ llvm::SmallVector<int64_t> swapLastTwoElements(llvm::ArrayRef<int64_t> shape) {
 /// output strides = 60x20x5x1
 llvm::SmallVector<int64_t> defaultStrides(llvm::ArrayRef<int64_t> shape);
 
+/// Checks if the given `type` is a 1-D vector type that requires the
+/// VectorAnyINTEL capability. In other words, the vector size is not
+/// supported by SPIR-V, which only supports vectors of 2, 3, 4, 8, or 16
+/// elements (8 and 16 require the Vector16 capability).
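+/// For example, vector<16xf32> is a SPIR-V-supported size, while a size such
+/// as vector<5xf32> is not and therefore requires VectorAnyINTEL.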
+bool isVectorAnyINTELType(mlir::Type type);
+
 } // namespace imex
 
 #endif
diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt
index 813781dd3..b52337e57 100644
--- a/lib/Conversion/CMakeLists.txt
+++ b/lib/Conversion/CMakeLists.txt
@@ -4,5 +4,6 @@ add_subdirectory(DropRegions)
 add_subdirectory(GPUToSPIRV)
 add_subdirectory(GPUToGPUX)
 add_subdirectory(GPUXToLLVM)
+add_subdirectory(MathToVC)
 add_subdirectory(XeTileToXeGPU)
 add_subdirectory(XeGPUToVC)
diff --git a/lib/Conversion/MathToVC/CMakeLists.txt b/lib/Conversion/MathToVC/CMakeLists.txt
new file mode 100644
index 000000000..ab5d76327
--- /dev/null
+++ b/lib/Conversion/MathToVC/CMakeLists.txt
@@ -0,0 +1,21 @@
+add_imex_conversion_library(IMEXMathToVC
+  MathToVC.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/MathToVC
+
+  DEPENDS
+  IMEXConversionPassIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRSupport
+  MLIRLLVMCommonConversion
+  MLIRGPUDialect
+  MLIRPass
+)
diff --git a/lib/Conversion/MathToVC/MathToVC.cpp b/lib/Conversion/MathToVC/MathToVC.cpp
new file mode 100644
index 000000000..0e0121c91
--- /dev/null
+++ b/lib/Conversion/MathToVC/MathToVC.cpp
@@ -0,0 +1,318 @@
+//===- MathToVC.cpp - MathToVC conversion -----------------------*- C++ -*-===//
+//
+// Copyright 2024 Intel Corporation
+// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the conversion of select math dialect operations into
+/// Func dialect calls to vc-intrinsics functions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "imex/Conversion/MathToVC/MathToVC.h"
+#include "imex/Utils/VCUtils.h"
+#include "imex/Utils/XeCommon.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/IR/BuiltinDialect.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/ValueRange.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace imex {
+#define GEN_PASS_DEF_CONVERTMATHTOVC
+#include "imex/Conversion/Passes.h.inc"
+} // namespace imex
+
+using namespace mlir;
+using namespace imex;
+
+namespace {
+//===----------------------------------------------------------------------===//
+// Utility functions
+//===----------------------------------------------------------------------===//
+
+// Get the VC intrinsic name for the given math operation.
+template <typename MOp> std::string getVCIntrinsicName() {
+  constexpr bool isCeilOp = std::is_same_v<MOp, math::CeilOp>;
+  constexpr bool isFloorOp = std::is_same_v<MOp, math::FloorOp>;
+  constexpr bool isExpOp = std::is_same_v<MOp, math::ExpOp>;
+  constexpr bool isExp2Op = std::is_same_v<MOp, math::Exp2Op>;
+  if (isCeilOp)
+    return "llvm.genx.rndu.";
+  else if (isFloorOp)
+    return "llvm.genx.rndd.";
+  else if (isExpOp || isExp2Op)
+    return "llvm.genx.exp.";
+  else
+    assert(0 && "Unsupported math Op. Add more support!");
+}
+
+// Utility function to convert a scalar or vector type of any float bitwidth
+// to another float element type, preserving the shape.
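+// E.g., (vector<16xf16>, f32) -> vector<16xf32>, and (f64, f32) -> f32.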
+Type convertScalarOrVectorFloatType(Type srcType, Type dstElementType) {
+  // Get a vector type or scalar type of dstElementType with the same shape
+  // as srcType.
+  if (auto vecTy = dyn_cast<VectorType>(srcType)) {
+    auto newTy = VectorType::get(vecTy.getShape(), dstElementType);
+    return newTy;
+  } else if (auto scalarTy = dyn_cast<FloatType>(srcType)) {
+    return dstElementType;
+  } else {
+    assert(0 && "Unsupported type");
+  }
+}
+
+// Utility function to convert a range of float args to a specific float type.
+// The function converts the float args to dstElementType and generates an
+// extension or truncation op if the bitwidths of the src and dst types differ.
+SmallVector<Value> convertFloatArgsType(SmallVector<Value> args,
+                                        Type dstElementType,
+                                        ConversionPatternRewriter &rewriter) {
+  SmallVector<Value> newArgs;
+  auto dstBitWidth = dstElementType.getIntOrFloatBitWidth();
+  for (auto arg : args) {
+    // Assert if src and dst types are not float types.
+    assert(((isa<FloatType>(arg.getType()) ||
+             isa<FloatType>(
+                 dyn_cast<VectorType>(arg.getType()).getElementType())) &&
+            isa<FloatType>(dstElementType)) &&
+           "Unsupported type, src and dst both should be float types");
+    auto srcBitWidth =
+        dyn_cast<VectorType>(arg.getType())
+            ? dyn_cast<VectorType>(arg.getType()).getElementTypeBitWidth()
+            : arg.getType().getIntOrFloatBitWidth();
+
+    if (srcBitWidth == dstBitWidth)
+      newArgs.push_back(arg);
+    else if (srcBitWidth < dstBitWidth) {
+      auto newType =
+          convertScalarOrVectorFloatType(arg.getType(), dstElementType);
+      auto newOp = rewriter.create<arith::ExtFOp>(arg.getLoc(), newType, arg);
+      newArgs.push_back(newOp);
+    } else if (srcBitWidth > dstBitWidth) {
+      auto newType =
+          convertScalarOrVectorFloatType(arg.getType(), dstElementType);
+      auto newOp = rewriter.create<arith::TruncFOp>(arg.getLoc(), newType, arg);
+      newArgs.push_back(newOp);
+    }
+  }
+  return newArgs;
+}
+//===----------------------------------------------------------------------===//
+// Operation conversion
+//===----------------------------------------------------------------------===//
+
+// Elementwise math-to-vc-intrinsics conversion pattern for ops that the VC
+// backend only supports in f32.
+template <typename MOp>
+struct ElementwiseFloatOnlyMathOpPattern final
+    : public OpConversionPattern<MOp> {
+  using OpConversionPattern<MOp>::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(MOp op, typename MOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Type opElementType;
+    // Check if the result type is a 1-D vector; bail on higher ranks.
+    if (auto vecTy = dyn_cast<VectorType>(op.getType())) {
+      if (vecTy.getRank() != 1)
+        return failure();
+      opElementType = vecTy.getElementType();
+    } else {
+      opElementType = op.getType();
+    }
+    auto loc = op.getLoc();
+    auto args = adaptor.getOperands();
+    // Upconvert or downconvert all operand element types to f32. Warn on
+    // truncation, since a truncated input can produce a result that differs
+    // from the original value.
+    if (opElementType.getIntOrFloatBitWidth() > 32)
+      emitWarning(op.getLoc(), "Truncation is done on input during conversion, "
+                               "may result in wrong result.\n");
+    llvm::SmallVector<Value> newArgs =
+        convertFloatArgsType(args, rewriter.getF32Type(), rewriter);
+
+    // The result element type is always f32.
+    auto newType =
+        convertScalarOrVectorFloatType(op.getType(), rewriter.getF32Type());
+    std::string resStr = "f32";
+    resStr.insert(
+        0, ((dyn_cast<VectorType>(newType))
+                ? llvm::formatv("v{0}",
+                                dyn_cast<VectorType>(newType).getNumElements())
+                      .str()
+                : ""));
+
+    // Generate a call to the corresponding VC intrinsic, suffixed with the
+    // encoded result type.
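+    // E.g., math.ceil on vector<16xf32> produces a call to
+    // "llvm.genx.rndu.v16f32".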
+    auto funcName = getVCIntrinsicName<MOp>();
+    funcName += resStr;
+    auto callOp =
+        createFuncCall(rewriter, loc, funcName, {newType}, newArgs, false);
+
+    // Initialize a SmallVector with the call result.
+    SmallVector<Value> callOpResult;
+    callOpResult.push_back(callOp.getResult(0));
+
+    // Convert the result of the call back to the original element type.
+    auto originalResultType =
+        convertFloatArgsType(callOpResult, opElementType, rewriter);
+
+    rewriter.replaceOp(op, originalResultType);
+    return success();
+  }
+};
+
+// ExpOp conversion pattern; supports both math::ExpOp and math::Exp2Op.
+template <typename ExpOp>
+struct ExpOpPattern final : public OpConversionPattern<ExpOp> {
+  ExpOpPattern(MLIRContext *ctx, bool enableHighPrecisionInterimCalculation)
+      : OpConversionPattern<ExpOp>(ctx),
+        enableHighPrecisionInterimCalculation(
+            enableHighPrecisionInterimCalculation) {}
+  LogicalResult
+  matchAndRewrite(ExpOp op, typename ExpOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto vecTy = dyn_cast<VectorType>(op.getType());
+
+    // Only deal with exp ops of 1-D vector type.
+    if (!(vecTy && vecTy.getRank() == 1))
+      return failure();
+
+    auto loc = op.getLoc();
+
+    // "llvm.genx.exp" returns the base-2 exponentiation of the input. To get
+    // the base-e exponentiation, the input has to be scaled by log2(e).
+    bool isExpOp = std::is_same_v<ExpOp, math::ExpOp>;
+    auto operands = adaptor.getOperands();
+    SmallVector<Value> args{operands};
+    // Scale the input by a constant vector holding log2(e) = 1.442695040888963.
+    if (isExpOp) {
+      // Create the interim instructions in an f32 vector type if the element
+      // type is narrower than 32 bits and high precision is requested.
+      if (this->enableHighPrecisionInterimCalculation &&
+          vecTy.getElementType().getIntOrFloatBitWidth() < 32) {
+        auto interimVectorType =
+            VectorType::get(vecTy.getShape(), rewriter.getF32Type());
+        auto vecAttr = DenseElementsAttr::get(
+            interimVectorType,
+            rewriter.getFloatAttr(interimVectorType.getElementType(),
+                                  1.442695040888963));
+        auto log2eConstVec =
+            rewriter.create<arith::ConstantOp>(loc, interimVectorType, vecAttr);
+        auto input = convertFloatArgsType({operands[0]}, rewriter.getF32Type(),
+                                          rewriter);
+        auto scaledInputf32 = rewriter.create<arith::MulFOp>(
+            op.getLoc(), input[0], log2eConstVec);
+        auto scaledInput = convertFloatArgsType(
+            {scaledInputf32}, vecTy.getElementType(), rewriter);
+        args.clear();
+        args.push_back(scaledInput[0]);
+      } else {
+        auto vecAttr = DenseElementsAttr::get(
+            vecTy,
+            rewriter.getFloatAttr(vecTy.getElementType(), 1.442695040888963));
+        auto log2eConstVec =
+            rewriter.create<arith::ConstantOp>(loc, vecTy, vecAttr);
+        auto input = operands[0];
+        auto scaledInput =
+            rewriter.create<arith::MulFOp>(op.getLoc(), input, log2eConstVec);
+        args.clear();
+        args.push_back(scaledInput);
+      }
+    }
+    // Generate a call to the corresponding VC intrinsic, suffixed with the
+    // encoded vector type.
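+    // E.g., for vector<16xf16> encodeVectorType yields "v8i32" (16 f16
+    // elements are encoded as 8 i32s), giving "llvm.genx.exp.v8i32".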
+    auto funcName = getVCIntrinsicName<ExpOp>();
+    funcName += encodeVectorType(rewriter, vecTy).first;
+    auto callOp =
+        createFuncCall(rewriter, loc, funcName, {op.getType()}, args, false);
+    rewriter.replaceOp(op, callOp);
+    return success();
+  }
+
+private:
+  const bool enableHighPrecisionInterimCalculation;
+};
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Pattern Population
+//===----------------------------------------------------------------------===//
+
+void imex::populateMathToVCPatterns(
+    ::mlir::LLVMTypeConverter &typeConverter,
+    ::mlir::RewritePatternSet &patterns,
+    bool enableHighPrecisionInterimCalculation) {
+  // Add patterns
+  patterns.add<ElementwiseFloatOnlyMathOpPattern<math::CeilOp>,
+               ElementwiseFloatOnlyMathOpPattern<math::FloorOp>>(
+      patterns.getContext());
+  patterns.add<ExpOpPattern<math::ExpOp>, ExpOpPattern<math::Exp2Op>>(
+      patterns.getContext(), enableHighPrecisionInterimCalculation);
+}
+
+//===----------------------------------------------------------------------===//
+// Conversion Legality configuration
+//===----------------------------------------------------------------------===//
+void imex::configureMathToVCConversionLegality(
+    ::mlir::ConversionTarget &target) {
+  // Add legal dialects
+  target.addLegalDialect<::mlir::func::FuncDialect,
+                         ::mlir::arith::ArithDialect,
+                         ::mlir::vector::VectorDialect,
+                         ::mlir::LLVM::LLVMDialect>();
+  // math.exp and math.exp2 are only converted if they produce 1-D vectors.
+  target.addDynamicallyLegalOp<::mlir::math::ExpOp, ::mlir::math::Exp2Op>(
+      [&](Operation *op) {
+        if (auto vecTy = dyn_cast<VectorType>(op->getResult(0).getType())) {
+          if (vecTy.getRank() != 1)
+            return true;
+          return false;
+        }
+        return true;
+      });
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Definition
+//===----------------------------------------------------------------------===//
+
+namespace {
+struct MathToVCPass : public imex::impl::ConvertMathToVCBase<MathToVCPass> {
+  using Base::Base;
+  MathToVCPass(bool enableHighPrecisionInterimCalculation)
+      : imex::impl::ConvertMathToVCBase<MathToVCPass>() {
+    this->enableHighPrecisionInterimCalculation.setValue(
+        enableHighPrecisionInterimCalculation);
+  }
+  void runOnOperation() override {
+    gpu::GPUModuleOp m = getOperation();
+    LLVMTypeConverter typeConverter(&getContext());
+    ConversionTarget target(getContext());
+    RewritePatternSet patterns(&getContext());
+
+    // Add patterns and configure legality.
+    imex::populateMathToVCPatterns(
+        typeConverter, patterns,
+        this->enableHighPrecisionInterimCalculation.getValue());
+    configureMathToVCConversionLegality(target);
+
+    if (failed(applyPartialConversion(m, target, std::move(patterns))))
+      return signalPassFailure();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<::mlir::OperationPass<::mlir::gpu::GPUModuleOp>>
+imex::createConvertMathToVCPass() {
+  return std::make_unique<MathToVCPass>();
+}
diff --git a/lib/Conversion/XeGPUToVC/CMakeLists.txt b/lib/Conversion/XeGPUToVC/CMakeLists.txt
index f3f0d6699..262a04ec9 100644
--- a/lib/Conversion/XeGPUToVC/CMakeLists.txt
+++ b/lib/Conversion/XeGPUToVC/CMakeLists.txt
@@ -1,7 +1,6 @@
 add_imex_conversion_library(IMEXXeGPUToVC
   LSCPatterns.cpp
   XeGPUToVC.cpp
-  Utils.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/XeGPUToVC
diff --git a/lib/Conversion/XeGPUToVC/LSCPatterns.cpp b/lib/Conversion/XeGPUToVC/LSCPatterns.cpp
index c6dfec61b..4c81c523b 100644
--- a/lib/Conversion/XeGPUToVC/LSCPatterns.cpp
+++ b/lib/Conversion/XeGPUToVC/LSCPatterns.cpp
@@ -38,7 +38,7 @@
 #include "llvm/Support/FormatVariadic.h"
 
 #include "LscIntrinsicEnums.h"
-#include "Utils.h"
+#include "imex/Utils/VCUtils.h"
 
 using namespace mlir;
 using mlir::xegpu::AtomicRMWOp;
diff --git a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
index ed9369a4c..19ea6d699 100644
--- a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
+++ b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
@@ -34,13 +34,13 @@
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 
+#include "imex/Conversion/MathToVC/MathToVC.h"
+#include "imex/Utils/VCUtils.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/FormatVariadic.h"
 
-#include "Utils.h"
-
 namespace imex {
 #define GEN_PASS_DEF_CONVERTXEGPUTOVC
 #include "imex/Conversion/Passes.h.inc"
@@ -889,15 +889,6 @@ struct XeGPUToVCPass : public imex::impl::ConvertXeGPUToVCBase<XeGPUToVCPass> {
       return true;
     });
 
-    target.addDynamicallyLegalOp<math::ExpOp>([&](math::ExpOp op) {
-      if (auto vecTy = dyn_cast<VectorType>(op.getType())) {
-        if (vecTy.getRank() != 1)
-          return true;
-        return false;
-      }
-      return true;
-    });
-
     target.addIllegalOp();
 
     // TODO: can we change it to addDynamicLegalOp?
@@ -962,8 +953,7 @@ struct XeGPUToVCPass : public imex::impl::ConvertXeGPUToVCBase<XeGPUToVCPass> {
 
     // Ops to llvm.genx only Patterns
     patterns.add<VectorShapeCastPattern, SCFYieldOpPattern,
-                 ElementwiseToVCPattern<math::ExpOp>, DpasPattern,
+                 ElementwiseToVCPattern<arith::MaximumFOp>, DpasPattern,
                  NbarrierArrivePattern>(patterns.getContext());
-
     // Ops to LSC only patterns
@@ -971,6 +961,10 @@ struct XeGPUToVCPass : public imex::impl::ConvertXeGPUToVCBase<XeGPUToVCPass> {
     populateLoadStoreLSCPatterns(typeConverter, patterns);
 
+    populateMathToVCPatterns(typeConverter, patterns);
+
+    configureMathToVCConversionLegality(target);
+
     if (failed(applyPartialConversion(m, target, std::move(patterns))))
       return signalPassFailure();
   }
diff --git a/lib/Utils/CMakeLists.txt b/lib/Utils/CMakeLists.txt
index 7b6ad3baa..75dcf17cf 100644
--- a/lib/Utils/CMakeLists.txt
+++ b/lib/Utils/CMakeLists.txt
@@ -2,6 +2,7 @@ add_mlir_library(IMEXUtil
   FuncUtils.cpp
   PassUtils.cpp
   TypeConversion.cpp
+  VCUtils.cpp
  XeCommon.cpp
   XeArch.cpp
diff --git a/lib/Conversion/XeGPUToVC/Utils.cpp b/lib/Utils/VCUtils.cpp
similarity index 95%
rename from lib/Conversion/XeGPUToVC/Utils.cpp
rename to lib/Utils/VCUtils.cpp
index 7efc9efd0..f2c4f2c2f 100644
--- a/lib/Conversion/XeGPUToVC/Utils.cpp
+++ b/lib/Utils/VCUtils.cpp
@@ -8,10 +8,11 @@
 //===----------------------------------------------------------------------===//
 ///
 /// \file
-/// This file implements some utils used in XeGPUToVC pass
+/// This file implements some utils used in the VC conversion passes
 ///
 //===----------------------------------------------------------------------===//
 
+#include "imex/Utils/VCUtils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -28,7 +29,7 @@ FlatSymbolRefAttr getFuncRefAttr(gpu::GPUModuleOp module, StringRef name,
                                  TypeRange resultType, ValueRange operands,
                                  bool isVectorComputeFunction,
                                  bool emitCInterface,
-                                 bool emitSPIRVLinkage = true) {
+                                 bool emitSPIRVLinkage /* = true*/) {
   MLIRContext *context = module.getContext();
   auto result = SymbolRefAttr::get(context, name);
diff --git a/lib/Utils/XeCommon.cpp b/lib/Utils/XeCommon.cpp
index e4a71c197..c701d9155 100644
--- a/lib/Utils/XeCommon.cpp
+++ b/lib/Utils/XeCommon.cpp
@@ -12,10 +12,10 @@
 /// routines used by Xe related dialects.
 ///
 //===----------------------------------------------------------------------===//
-
 #include
 #include
 #include
+#include <unordered_set>
 
 #include "imex/Dialect/XeTile/IR/XeTileOps.h"
 #include "imex/Utils/DebugUtils.h"
@@ -211,4 +211,16 @@ mlir::TypedValue<mlir::VectorType> stack(mlir::Value vecUp, mlir::Value vecDown,
   return op;
 }
 
+/// Checks if the given `type` is a 1-D vector type that requires the
+/// VectorAnyINTEL capability.
+/// In other words, the vector size is not supported by SPIR-V, which only
+/// supports vectors of 2, 3, 4, 8, or 16 elements (8 and 16 require the
+/// Vector16 capability).
+bool isVectorAnyINTELType(mlir::Type type) {
+  std::unordered_set<int64_t> spirvSupportedSizes = {2, 3, 4, 8, 16};
+  auto vecType = mlir::dyn_cast<mlir::VectorType>(type);
+  return vecType && vecType.getRank() == 1 &&
+         (spirvSupportedSizes.find(vecType.getNumElements()) ==
+          spirvSupportedSizes.end());
+}
+
 } // namespace imex
diff --git a/test/Conversion/MathToVC/math-to-vc.mlir b/test/Conversion/MathToVC/math-to-vc.mlir
new file mode 100644
index 000000000..f0fb831dd
--- /dev/null
+++ b/test/Conversion/MathToVC/math-to-vc.mlir
@@ -0,0 +1,78 @@
+// RUN: imex-opt -convert-math-to-vc -verify-diagnostics %s | FileCheck %s --check-prefixes=CHECK
+// RUN: imex-opt -convert-math-to-vc="enable-high-precision-interim-calculation=true" -verify-diagnostics %s | FileCheck %s --check-prefixes=HIGH_PRECISION
+
+module @gemm attributes {gpu.container_module} {
+  gpu.module @math_to_vc {
+    // CHECK-LABEL: gpu.func @ceil_f16
+    gpu.func @ceil_f16(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %c0 = arith.constant 0 : index
+      %0 = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      // CHECK: %[[EXTF_F32:.*]] = arith.extf {{.*}} : vector<16xf16> to vector<16xf32>
+      // CHECK-NEXT: %[[CEILF:.*]] = func.call @llvm.genx.rndu.v16f32(%[[EXTF_F32]]) : (vector<16xf32>) -> vector<16xf32>
+      // CHECK-NEXT: %[[TRUNC_F16:.*]] = arith.truncf %[[CEILF]] : vector<16xf32> to vector<16xf16>
+      %2 = math.ceil %0 : vector<16xf16>
+      gpu.return
+    }
+
+    // -----
+
+    // CHECK-LABEL: gpu.func @ceil_f64
+    gpu.func @ceil_f64(%arg0: memref<8x16xf64>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %c0 = arith.constant 0 : index
+      %0 = vector.load %arg0[%c0, %c0] : memref<8x16xf64>, vector<16xf64>
+      // CHECK: %[[TRUNCF_F32:.*]] = arith.truncf {{.*}} : vector<16xf64> to vector<16xf32>
+      // CHECK-NEXT: %[[CEILF:.*]] = func.call @llvm.genx.rndu.v16f32(%[[TRUNCF_F32]]) : (vector<16xf32>) -> vector<16xf32>
+      // CHECK-NEXT: %[[EXTF_F64:.*]] = arith.extf %[[CEILF]] : vector<16xf32> to vector<16xf64>
+      // expected-warning@+1 {{Truncation is done on input during conversion, may result in wrong result.}}
+      %2 = math.ceil %0 : vector<16xf64>
+      gpu.return
+    }
+
+    // -----
+
+    // CHECK-LABEL: gpu.func @floor_f16
+    gpu.func @floor_f16(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %c0 = arith.constant 0 : index
+      %0 = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      // CHECK: %[[EXTF_F32:.*]] = arith.extf {{.*}} : vector<16xf16> to vector<16xf32>
+      // CHECK-NEXT: %[[FLOORF:.*]] = func.call @llvm.genx.rndd.v16f32(%[[EXTF_F32]]) : (vector<16xf32>) -> vector<16xf32>
+      // CHECK-NEXT: %[[TRUNC_F16:.*]] = arith.truncf %[[FLOORF]] : vector<16xf32> to vector<16xf16>
+      %2 = math.floor %0 : vector<16xf16>
+      gpu.return
+    }
+
+    // -----
+
+    // CHECK-LABEL: gpu.func @exp_f16
+    // HIGH_PRECISION-LABEL: gpu.func @exp_f16
+    gpu.func @exp_f16(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %c0 = arith.constant 0 : index
+      // HIGH_PRECISION: %[[VEC:.*]] = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      %v1 = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      // CHECK: %[[LOG2E_VEC:.*]] = arith.constant dense<1.44{{.*}}> : vector<16xf16>
+      // CHECK-NEXT: %[[MULF:.*]] = arith.mulf {{.*}} %[[LOG2E_VEC]]
+      // CHECK-NEXT: func.call @llvm.genx.exp.v8i32(%[[MULF]])
+      // HIGH_PRECISION: %[[LOG2E_VEC:.*]] = arith.constant dense<1.44{{.*}}> : vector<16xf32>
+      // HIGH_PRECISION: %[[VEC_F32:.*]] = arith.extf %[[VEC]] : vector<16xf16> to vector<16xf32>
+      // HIGH_PRECISION: %[[MULF_F32:.*]] = arith.mulf %[[VEC_F32]], %[[LOG2E_VEC]] : vector<16xf32>
+      // HIGH_PRECISION: %[[MULF:.*]] = arith.truncf %[[MULF_F32]] : vector<16xf32> to vector<16xf16>
+      // HIGH_PRECISION: func.call @llvm.genx.exp.v8i32(%[[MULF]])
+      %2 = math.exp %v1 : vector<16xf16>
+      gpu.return
+    }
+
+    // -----
+
+    // CHECK-LABEL: gpu.func @exp2_f16
+    gpu.func @exp2_f16(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %c0 = arith.constant 0 : index
+      // CHECK: %[[VEC:.*]] = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      %v1 = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
+      // CHECK-NEXT: func.call @llvm.genx.exp.v8i32(%[[VEC]])
+      %2 = math.exp2 %v1 : vector<16xf16>
+      gpu.return
+    }
+  }
+}
+
+// -----
diff --git a/test/Conversion/XeGPUToVC/eltwise.mlir b/test/Conversion/XeGPUToVC/eltwise.mlir
index f177e0f2d..24d59e3ac 100644
--- a/test/Conversion/XeGPUToVC/eltwise.mlir
+++ b/test/Conversion/XeGPUToVC/eltwise.mlir
@@ -7,8 +7,7 @@ module @gemm attributes {gpu.container_module} {
     %c0 = arith.constant 0 : index
     %cv1 = arith.constant dense<1.0> : vector<16xf32>
     %v1 = vector.load %arg0[%c0, %c0] : memref<8x16xf32>, vector<16xf32>
-    // CHECK: %[[LOG2E:.*]] = arith.constant 1.44{{.*}} f32
-    // CHECK-NEXT: %[[LOG2E_VEC:.*]] = vector.broadcast %[[LOG2E]] : f32 to vector<16xf32>
+    // CHECK: %[[LOG2E_VEC:.*]] = arith.constant dense<1.44{{.*}}> : vector<16xf32>
     // CHECK-NEXT: %[[MULF:.*]] = arith.mulf {{.*}} %[[LOG2E_VEC]]
     // CHECK-NEXT: func.call @llvm.genx.exp.v16f32(%[[MULF]])
     %1 = math.exp %v1 fastmath<fast> : vector<16xf32>
@@ -31,8 +30,7 @@ module @gemm attributes {gpu.container_module} {
   gpu.func @exp_f16(%arg0: memref<8x16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
     %c0 = arith.constant 0 : index
     %v1 = vector.load %arg0[%c0, %c0] : memref<8x16xf16>, vector<16xf16>
-    // CHECK: %[[LOG2E:.*]] = arith.constant 1.44{{.*}} f16
-    // CHECK-NEXT: %[[LOG2E_VEC:.*]] = vector.broadcast %[[LOG2E]] : f16 to vector<16xf16>
+    // CHECK: %[[LOG2E_VEC:.*]] = arith.constant dense<1.44{{.*}}> : vector<16xf16>
     // CHECK-NEXT: %[[MULF:.*]] = arith.mulf {{.*}} %[[LOG2E_VEC]]
     // CHECK-NEXT: func.call @llvm.genx.exp.v8i32(%[[MULF]])
     %2 = math.exp %v1 : vector<16xf16>
diff --git a/test/Integration/Dialect/Gpu/ceil_floor_BF16.mlir b/test/Integration/Dialect/Gpu/ceil_floor_BF16.mlir
new file mode 100644
index 000000000..c6498e8d7
--- /dev/null
+++ b/test/Integration/Dialect/Gpu/ceil_floor_BF16.mlir
@@ -0,0 +1,53 @@
+// RUN: IMEX_USE_IGC_VECTOR_BACK_END=1 %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/gpu-to-llvm.pp \
+// RUN:   --runner imex-cpu-runner -e main \
+// RUN:   --entry-point-result=void \
+// RUN:   --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
+// RUN: IMEX_USE_IGC_VECTOR_BACK_END=1 %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/gpu-to-llvm.pp \
+// RUN:   --runner imex-cpu-runner -e main \
+// RUN:   --entry-point-result=void \
+// RUN:   --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
+
+module @eltwise_add attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_10x20xbf16 : memref<10x20xbf16> = dense<5.000000e-01>
+  func.func @test(%arg0: memref<10x20xbf16>, %arg1: memref<10x20xbf16>) -> memref<10x20xbf16> {
+    %c20 = arith.constant 20 : index
+    %c10 = arith.constant 10 : index
+    %c1 = arith.constant 1 : index
+    %memref = gpu.alloc host_shared () : memref<10x20xbf16>
+    memref.copy %arg1, %memref : memref<10x20xbf16> to memref<10x20xbf16>
+    %memref_0 = gpu.alloc host_shared () : memref<10x20xbf16>
+    memref.copy %arg0, %memref_0 : memref<10x20xbf16> to memref<10x20xbf16>
+    %memref_1 = gpu.alloc host_shared () : memref<10x20xbf16>
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c10, %c20, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<10x20xbf16>, %memref : memref<10x20xbf16>, %memref_1 : memref<10x20xbf16>)
+    %alloc = memref.alloc() : memref<10x20xbf16>
+    memref.copy %memref_1, %alloc : memref<10x20xbf16> to memref<10x20xbf16>
+    gpu.dealloc %memref_1 : memref<10x20xbf16>
+    gpu.dealloc %memref_0 : memref<10x20xbf16>
+    gpu.dealloc %memref : memref<10x20xbf16>
+    return %alloc : memref<10x20xbf16>
+  }
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<10x20xbf16>, %arg1: memref<10x20xbf16>, %arg2: memref<10x20xbf16>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 10, 20, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %block_id_x = gpu.block_id x
+      %block_id_y = gpu.block_id y
+      %0 = memref.load %arg0[%block_id_x, %block_id_y] : memref<10x20xbf16>
+      %1 = memref.load %arg1[%block_id_x, %block_id_y] : memref<10x20xbf16>
+      %2 = math.ceil %0 : bf16
+      %3 = math.floor %1 : bf16
+      %4 = arith.addf %2, %3 : bf16
+      memref.store %4, %arg2[%block_id_x, %block_id_y] : memref<10x20xbf16>
+      gpu.return
+    }
+  }
+  func.func @main() {
+    %0 = memref.get_global @__constant_10x20xbf16 : memref<10x20xbf16>
+    %1 = memref.get_global @__constant_10x20xbf16 : memref<10x20xbf16>
+    %2 = call @test(%0, %1) : (memref<10x20xbf16>, memref<10x20xbf16>) -> memref<10x20xbf16>
+    %cast = memref.cast %2 : memref<10x20xbf16> to memref<*xbf16>
+    // CHECK: Unranked Memref base@ = {{(0x)?[-9a-f]*}}
+    // CHECK-COUNT-200: 1
+    call @printMemrefBF16(%cast) : (memref<*xbf16>) -> ()
+    return
+  }
+  func.func private @printMemrefBF16(memref<*xbf16>) attributes {llvm.emit_c_interface}
+}
diff --git a/test/Integration/Dialect/Gpu/gpu-to-llvm.pp b/test/Integration/Dialect/Gpu/gpu-to-llvm.pp
index 9effd7727..5ed75515b 100644
--- a/test/Integration/Dialect/Gpu/gpu-to-llvm.pp
+++ b/test/Integration/Dialect/Gpu/gpu-to-llvm.pp
@@ -4,6 +4,7 @@
 // Ready for imex runner starting from GPU dialect.
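+// Note: convert-math-to-vc runs on each gpu.module before SPIR-V conversion;
+// the high-precision option keeps interim math in f32 for narrow float types.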
 builtin.module(
     imex-vector-linearize
+    gpu.module(convert-math-to-vc{enable-high-precision-interim-calculation=true})
     reconcile-unrealized-casts
     bf16-to-gpu
     imex-convert-gpu-to-spirv
diff --git a/test/Integration/Dialect/XeGPU/ceil_floor_f32.mlir b/test/Integration/Dialect/XeGPU/ceil_floor_f32.mlir
new file mode 100644
index 000000000..9ee678a7c
--- /dev/null
+++ b/test/Integration/Dialect/XeGPU/ceil_floor_f32.mlir
@@ -0,0 +1,84 @@
+// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
+// RUN:   --runner imex-cpu-runner -e main \
+// RUN:   --entry-point-result=void \
+// RUN:   --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
+// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \
+// RUN:   --runner imex-cpu-runner -e main \
+// RUN:   --entry-point-result=void \
+// RUN:   --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
+module @gemm attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_8x16xf32 : memref<8x16xf32> = dense<0.0>
+  func.func @test(%arg0: memref<8x16xf32>, %arg1: memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+
+    %memref = gpu.alloc host_shared () : memref<8x16xf32>
+    memref.copy %arg0, %memref : memref<8x16xf32> to memref<8x16xf32>
+    %memref_1 = gpu.alloc host_shared () : memref<8x16xf32>
+    memref.copy %arg1, %memref_1 : memref<8x16xf32> to memref<8x16xf32>
+    %memref_2 = gpu.alloc host_shared () : memref<8x16xf32>
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c8, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_1 : memref<8x16xf32>, %memref_2 : memref<8x16xf32>)
+    gpu.dealloc %memref : memref<8x16xf32>
+    gpu.dealloc %memref_1 : memref<8x16xf32>
+    return %memref_2 : memref<8x16xf32>
+  }
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<8x16xf32>, %arg1: memref<8x16xf32>, %arg2: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %thread_id_x = gpu.thread_id x
+      cf.br ^bb1
+    ^bb1:
+      %0 = xegpu.create_nd_tdesc %arg1[%thread_id_x, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<16xf32>
+      %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+      %2 = xegpu.create_nd_tdesc %arg0[%thread_id_x, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<16xf32>
+      %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+      %4 = math.ceil %1 : vector<16xf32>
+      %5 = math.floor %3 : vector<16xf32>
+      %6 = arith.addf %4, %5 : vector<16xf32>
+      %7 = xegpu.create_nd_tdesc %arg2[%thread_id_x, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<16xf32>
+      xegpu.store_nd %6, %7 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+      gpu.return
+    }
+  }
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %c_gen_int = arith.constant 0 : i1
+    %cf_lower = arith.constant -0.5 : f32
+    %cf_upper = arith.constant 0.5 : f32
+
+    %A = memref.alloc() : memref<8x16xf32>
+    %A_random = memref.cast %A : memref<8x16xf32> to memref<*xf32>
+    call @fillResource1DRandomF32(%A_random, %cf_lower, %cf_upper, %c_gen_int) : (memref<*xf32>, f32, f32, i1) -> ()
+
+    %B = memref.alloc() : memref<8x16xf32>
+    %B_random = memref.cast %B : memref<8x16xf32> to memref<*xf32>
+    call @fillResource1DRandomF32(%B_random, %cf_lower, %cf_upper, %c_gen_int) : (memref<*xf32>, f32, f32, i1) -> ()
+
+    // Calculate the reference C matrix on the host.
+    %c16 = arith.constant 16 : index
+    %c8 = arith.constant 8 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %ref = memref.alloc() : memref<8x16xf32>
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        %a = memref.load %A[%i, %j] : memref<8x16xf32>
+        %b = memref.load %B[%i, %j] : memref<8x16xf32>
+        %a_ceiled = math.ceil %a : f32
+        %b_floored = math.floor %b : f32
+        %c = arith.addf %a_ceiled, %b_floored : f32
+        memref.store %c, %ref[%i, %j] : memref<8x16xf32>
+      }
+    }
+
+    %C = call @test(%A, %B) : (memref<8x16xf32>, memref<8x16xf32>) -> memref<8x16xf32>
+
+    %C_cast = memref.cast %C : memref<8x16xf32> to memref<*xf32>
+    %ref_cast = memref.cast %ref : memref<8x16xf32> to memref<*xf32>
+    // call @printMemrefF32(%C_cast) : (memref<*xf32>) -> ()
+    // CHECK: [ALLCLOSE: TRUE]
+    call @printAllcloseF32(%ref_cast, %C_cast) : (memref<*xf32>, memref<*xf32>) -> ()
+    return
+  }
+  func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
+  func.func private @fillResource1DRandomF32(memref<*xf32>, f32, f32, i1) attributes {llvm.emit_c_interface}
+  func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
+}
diff --git a/test/Integration/Dialect/XeGPU/xegpu-to-func-vc.pp b/test/Integration/Dialect/XeGPU/xegpu-to-func-vc.pp
index c3ca033ea..faa5343c5 100644
--- a/test/Integration/Dialect/XeGPU/xegpu-to-func-vc.pp
+++ b/test/Integration/Dialect/XeGPU/xegpu-to-func-vc.pp
@@ -7,6 +7,7 @@
                imex-xegpu-apply-vnni-transformation,
                imex-xegpu-optimize-transpose)
     imex-vector-linearize
+    gpu.module(convert-math-to-vc{enable-high-precision-interim-calculation=true})
     gpu.module(convert-xegpu-to-vc)
     cse
     reconcile-unrealized-casts