[BACKEND] Fix invalid intermediate IR in GPU to LLVM (#1810)

The arith.trunci op is not allowed to operate on the index type, so using it on an index value produces IR that fails the verifier. This doesn't cause a compilation failure, because index values are lowered to i32 in the same pass. However, it creates intermediate IR that fails the verifier, which can make things harder to debug.
triton-lang · Jun 21, 2023 · c3cba05 · c3cba05
1 parent 4c0e3d9
commit c3cba05
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 5 deletions.
diff --git a/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp
@@ -1180,6 +1180,35 @@ struct AbsFOpConversion
   }
 };
 
+/// Lowers arith.index_cast to an LLVM integer conversion, since `index`
+/// becomes a plain integer.  Equal source/target bit widths forward the
+/// operand unchanged; a wider target sign-extends; a narrower one truncates.
+/// createDestOp only builds the value for one element: replacing `op` is
+/// left to ElementwiseOpConversionBase, so `op` must not be replaced here.
+struct IndexCastOpLowering
+    : public ElementwiseOpConversionBase<arith::IndexCastOp,
+                                         IndexCastOpLowering> {
+  using Base =
+      ElementwiseOpConversionBase<arith::IndexCastOp, IndexCastOpLowering>;
+  using Base::Base;
+  using Adaptor = typename Base::OpAdaptor;
+
+  Value createDestOp(arith::IndexCastOp op, OpAdaptor adaptor,
+                     ConversionPatternRewriter &rewriter, Type elemTy,
+                     ValueRange operands, Location loc) const {
+    auto inElemTy =
+        this->getTypeConverter()->convertType(getElementType(op.getIn()));
+    unsigned targetBits = elemTy.getIntOrFloatBitWidth();
+    unsigned sourceBits = inElemTy.getIntOrFloatBitWidth();
+
+    if (targetBits == sourceBits)
+      return operands[0];
+    if (targetBits < sourceBits)
+      return rewriter.create<LLVM::TruncOp>(loc, elemTy, operands[0]);
+    return rewriter.create<LLVM::SExtOp>(loc, elemTy, operands[0]);
+  }
+};
+
 void populateElementwiseOpToLLVMPatterns(
     TritonGPUToLLVMTypeConverter &typeConverter, RewritePatternSet &patterns,
     PatternBenefit benefit) {
@@ -1240,6 +1269,7 @@ void populateElementwiseOpToLLVMPatterns(
   patterns.add<TruncFOpConversion>(typeConverter, benefit);
   patterns.add<FPToSIOpConversion>(typeConverter, benefit);
   patterns.add<SIToFPOpConversion>(typeConverter, benefit);
+  patterns.add<IndexCastOpLowering>(typeConverter, benefit);
 
   patterns.add<FpToFpOpConversion>(typeConverter, benefit);
 

diff --git a/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp
@@ -388,7 +388,7 @@ struct GetProgramIdOpConversion
 
     Value blockId =
         rewriter.create<::mlir::gpu::BlockIdOp>(loc, dims[op.getAxisAsInt()]);
-    rewriter.replaceOpWithNewOp<arith::TruncIOp>(op, i32_ty, blockId);
+    rewriter.replaceOpWithNewOp<arith::IndexCastOp>(op, i32_ty, blockId);
     return success();
   }
 
@@ -410,7 +410,7 @@ struct GetNumProgramsOpConversion
 
     Value blockId =
         rewriter.create<::mlir::gpu::GridDimOp>(loc, dims[op.getAxis()]);
-    rewriter.replaceOpWithNewOp<arith::TruncIOp>(op, i32_ty, blockId);
+    rewriter.replaceOpWithNewOp<arith::IndexCastOp>(op, i32_ty, blockId);
 
     return success();
   }

diff --git a/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMBase.h b/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMBase.h
@@ -217,10 +217,9 @@ class ConvertTritonGPUOpToLLVMPatternBase {
   }
 
   Value getThreadId(ConversionPatternRewriter &rewriter, Location loc) const {
-    auto llvmIndexTy = this->getTypeConverter()->getIndexType();
     auto tid = rewriter.create<::mlir::gpu::ThreadIdOp>(
-        loc, rewriter.getIndexType(), ::mlir::gpu::Dimension::x);
-    return rewriter.create<arith::TruncIOp>(loc, i32_ty, tid);
+        loc, ::mlir::gpu::Dimension::x);
+    return rewriter.create<arith::IndexCastOp>(loc, i32_ty, tid);
   }
 
   // -----------------------------------------------------------------------