From 72eec1cff90750a3a295494d85eaa64b4ddba0e4 Mon Sep 17 00:00:00 2001 From: Yan Xu Date: Fri, 28 Jun 2024 08:07:31 +0800 Subject: [PATCH] add view op dynamic shape propagate (#1303) add reshape/slice/dot op shape propagate --- .../disc/transforms/disc_shape_propagate.cc | 323 ++++++++++++++---- .../tests/disc-shape-propagate.mlir | 84 ++++- 2 files changed, 332 insertions(+), 75 deletions(-) diff --git a/tao_compiler/mlir/disc/transforms/disc_shape_propagate.cc b/tao_compiler/mlir/disc/transforms/disc_shape_propagate.cc index 64156322509..a17817b83a5 100644 --- a/tao_compiler/mlir/disc/transforms/disc_shape_propagate.cc +++ b/tao_compiler/mlir/disc/transforms/disc_shape_propagate.cc @@ -16,6 +16,8 @@ limitations under the License. // This file implements the logic to do some shape optimizations on tensor // level. #include +#include +#include #include #include @@ -47,14 +49,12 @@ namespace mlir { namespace disc_ral { using ::mlir::func::FuncOp; - namespace { std::string kDynamicDimsAttr = "input_dynamic_dims"; struct ShapeContext { ShapeContext() = default; ShapeContext(Value value, SmallVector shape) : value(value), shape(shape){}; - Value value; SmallVector shape; }; @@ -71,15 +71,18 @@ struct DiscShapePropagatePass registry.insert(); registry.insert(); } + void visitOperator(ModuleOp& m, OpBuilder& rewriter, Operation* op, + std::stack& ctxStack); void runOnOperation() override; }; bool isBinaryOp(Operation* op) { - return isa(*op) || isa(*op) || - isa(*op) || isa(*op); + return isa(op); } bool isUnaryOp(Operation* op) { - return isa(op); + return isa(op); } bool isConcreteShape(ShapeContext& ctx) { for (auto dim : ctx.shape) { @@ -109,29 +112,31 @@ std::optional getConstTensor(OpBuilder& b, Operation* op, std::optional HandleBinaryOp(OpBuilder& b, Operation* op, ShapeContext& inputCtx) { - if (!isBinaryOp(op)) return std::nullopt; - if (op->getOperand(1).isa()) { + auto bcastOp = dyn_cast_or_null( + op->getOperand(1).getDefiningOp()); + if (!bcastOp) { return ShapeContext(op->getResult(0), inputCtx.shape); } - if (auto const_op = - dyn_cast(op->getOperand(1).getDefiningOp())) { + if (bcastOp) { + auto constOp = dyn_cast_or_null( + bcastOp->getOperand(0).getDefiningOp()); + if (!constOp) { + return ShapeContext(op->getResult(0), inputCtx.shape); + } auto elemTy = op->getOperand(0).getType().cast().getElementType(); b.setInsertionPoint(op); - auto dense_attr = const_op.getValue().dyn_cast(); - int64_t value = (*dense_attr.getValues().begin()).getSExtValue(); + auto dense_attr = constOp.getValue().dyn_cast(); + int64_t value = dense_attr.getValues()[0]; auto scalar_const_op = getConstTensor(b, op, {value}, {}); Value inputShape = b.create(op->getLoc(), op->getOperand(0)); auto rank = inputCtx.shape.size(); - SmallVector boradcast_dim; - boradcast_dim.push_back(static_cast(rank)); - auto bcast_op = b.create( + auto dynBcastOp = b.create( op->getLoc(), RankedTensorType::get(inputCtx.shape, elemTy), scalar_const_op.value(), inputShape, b.getI64TensorAttr({})); - const_op.getResult().replaceAllUsesWith(bcast_op.getResult()); - const_op.erase(); + bcastOp.getResult().replaceAllUsesWith(dynBcastOp.getResult()); } return ShapeContext(op->getResult(0), inputCtx.shape); } @@ -156,19 +161,149 @@ std::optional propagateHelper( template <> std::optional propagateHelper( OpBuilder& b, Operation* op, ShapeContext& inputCtx) { - auto dot_op = dyn_cast_or_null(op); + auto dot_op = dyn_cast(op); if (!dot_op) return std::nullopt; + auto lhs = dot_op.getOperand(0); + auto rhs = dot_op.getOperand(1); + if 
(inputCtx.value == lhs) {
+    return ShapeContext(op->getResult(0),
+                        {inputCtx.shape[0],
+                         rhs.getType().cast<RankedTensorType>().getShape()[1]});
+  } else {
+    return ShapeContext(op->getResult(0),
+                        {lhs.getType().cast<RankedTensorType>().getShape()[0],
+                         inputCtx.shape[1]});
+  }
+}
-  auto lhs_shape =
-      dot_op.getOperand(0).getType().cast<RankedTensorType>().getShape();
-  auto rhs_shape =
-      dot_op.getOperand(1).getType().cast<RankedTensorType>().getShape();
-  auto result_shape =
-      dot_op.getResult().getType().cast<RankedTensorType>().getShape();
-  SmallVector<int64_t> new_shape;
-  new_shape.push_back(lhs_shape[0]);
-  new_shape.push_back(rhs_shape[1]);
-  return ShapeContext(op->getResult(0), new_shape);
+
+template <>
+std::optional<ShapeContext> propagateHelper<mhlo::ReshapeOp>(
+    OpBuilder& b, Operation* op, ShapeContext& inputCtx) {
+  auto reshape_op = dyn_cast<mhlo::ReshapeOp>(op);
+  if (!reshape_op) return std::nullopt;
+  Type intType = b.getIntegerType(32);
+  int rank =
+      reshape_op.getOperand().getType().cast<RankedTensorType>().getRank();
+  auto resultRankType =
+      reshape_op.getResult().getType().cast<RankedTensorType>();
+  auto resultRank = resultRankType.getRank();
+  auto resultShape = resultRankType.getShape();
+  SmallVector<int64_t> newShape(resultRank, ShapedType::kDynamic);
+  int64_t numel =
+      std::accumulate(inputCtx.shape.begin(), inputCtx.shape.end(), int64_t(1),
+                      [](int64_t acc, int64_t num) {
+                        return num == ShapedType::kDynamic ? acc : acc * num;
+                      });
+
+  bool inferred = true;
+  while (inferred) {
+    inferred = false;
+    // Set concrete result dimensions where possible.
+    for (size_t i = 0; i < resultRank; ++i) {
+      for (size_t j = 0; j < rank; ++j) {
+        if (newShape[i] == ShapedType::kDynamic &&
+            resultShape[i] == inputCtx.shape[j]) {
+          newShape[i] = inputCtx.shape[j];
+          numel /= inputCtx.shape[j];
+          inferred = true;
+        }
+      }
+    }
+    for (size_t d = 0; d < resultRank; ++d) {
+      if (newShape[d] == ShapedType::kDynamic) {
+        if (numel % resultShape[d] == 0) {
+          numel /= resultShape[d];
+          newShape[d] = resultShape[d];
+          inferred = true;
+        }
+      }
+    }
+  }
+  // More than one dynamic dim is invalid; fall back to the concrete result
+  // shape to fill the extra dynamic dims.
+  int dynDims =
+      std::count(newShape.begin(), newShape.end(), ShapedType::kDynamic);
+  for (size_t i = 0; i < resultRank; ++i) {
+    if (newShape[i] == ShapedType::kDynamic && dynDims > 1) {
+      newShape[i] = resultShape[i];
+      dynDims--;
+      break;
+    }
+  }
+  SmallVector<Value> newShapeValues;
+  for (int64_t dim : newShape) {
+    if (dim == ShapedType::kDynamic) {
+      // -1 marks the dimension that compute_reshape_shape should calculate.
+      newShapeValues.push_back(
+          b.create<arith::ConstantIndexOp>(op->getLoc(), -1));
+    } else {
+      newShapeValues.push_back(
+          b.create<arith::ConstantIndexOp>(op->getLoc(), dim));
+    }
+  }
+  Value shapeValue =
+      b.create<tensor::FromElementsOp>(op->getLoc(), newShapeValues);
+
+  auto shape = b.create<shape::ShapeOfOp>(op->getLoc(), op->getOperand(0));
+  auto numElems = b.create<shape::NumElementsOp>(op->getLoc(), shape);
+
+  auto computeReshapeShape = b.create<mhlo::ComputeReshapeShapeOp>(
+      op->getLoc(), shapeValue.getType(), numElems.getResult(), shapeValue);
+  auto dynReshapeOpResultType =
+      RankedTensorType::get(newShape, resultRankType.getElementType());
+  auto dynReshapeOp = b.create<mhlo::DynamicReshapeOp>(
+      op->getLoc(), dynReshapeOpResultType, reshape_op.getOperand(),
+      computeReshapeShape);
+  op->getResult(0).replaceAllUsesWith(dynReshapeOp.getResult());
+  op->erase();
+  return ShapeContext(dynReshapeOp->getResult(0), newShape);
+}
+
+template <>
+std::optional<ShapeContext> propagateHelper<mhlo::SliceOp>(
+    OpBuilder& b, Operation* op, ShapeContext& inputCtx) {
+  auto slice_op = dyn_cast<mhlo::SliceOp>(op);
+  if (!slice_op) return std::nullopt;
+  b.setInsertionPoint(op);
+  auto loc = slice_op.getLoc();
+  auto rankType = slice_op.getOperand().getType().cast<RankedTensorType>();
+
+  auto inputShape = rankType.getShape();
+  auto rank = rankType.getRank();
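+  // Materialize start/limit/stride indices as SSA index values; a dimension
+  // whose limit equals the full (now dynamic) input extent keeps a dynamic
+  // size and reads its upper bound via tensor.dim instead of a constant.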
SmallVector startIndices(rank); + SmallVector limitIndices(rank); + SmallVector strides(rank); + SmallVector newShape(rank); + for (size_t i = 0; i < rankType.getRank(); ++i) { + auto startIndicesCst = slice_op.getStartIndices().getValues()[i]; + auto limitIndicesCst = slice_op.getLimitIndices().getValues()[i]; + auto stridesCst = slice_op.getStrides().getValues()[i]; + startIndices[i] = + b.create(slice_op.getLoc(), startIndicesCst); + // using dynamic dim if limitIndices is the same as input shape + if (limitIndicesCst == inputShape[i] && + inputCtx.shape[i] == ShapedType::kDynamic) { + limitIndices[i] = b.create(loc, slice_op.getOperand(), i); + newShape[i] = inputCtx.shape[i]; + } else { + limitIndices[i] = + b.create(slice_op.getLoc(), limitIndicesCst); + newShape[i] = (limitIndicesCst - startIndicesCst - 1) / stridesCst + 1; + } + strides[i] = + b.create(slice_op.getLoc(), stridesCst); + } + Value baseIndicesValue = b.create(loc, startIndices); + Value stridesValue = b.create(loc, strides); + Value limitIndicesValue = b.create(loc, limitIndices); + auto sliceOpResultType = + RankedTensorType::get(newShape, rankType.getElementType()); + auto dyncSliceOp = b.create( + loc, sliceOpResultType, slice_op.getOperand(), baseIndicesValue, + limitIndicesValue, stridesValue); + op->getResult(0).replaceAllUsesWith(dyncSliceOp.getResult()); + op->erase(); + return ShapeContext(dyncSliceOp->getResult(0), newShape); } template <> @@ -223,6 +358,25 @@ std::optional propagateHelper( return ShapeContext(op->getResult(0), new_shape); } +template <> +std::optional propagateHelper( + OpBuilder& b, Operation* op, ShapeContext& inputCtx) { + auto dot_general_op = dyn_cast_or_null(op); + if (!dot_general_op) return std::nullopt; + auto lhs = dot_general_op.getOperand(0); + auto rhs = dot_general_op.getOperand(1); + if (inputCtx.value == lhs) { + return ShapeContext(op->getResult(0), + {rhs.getType().cast().getShape()[0], + inputCtx.shape[1], + rhs.getType().cast().getShape()[2]}); + } else { + return ShapeContext(op->getResult(0), + {lhs.getType().cast().getShape()[0], + lhs.getType().cast().getShape()[1], + inputCtx.shape[2]}); + } +} template <> std::optional propagateHelper( @@ -293,6 +447,31 @@ std::optional propagateHelper( return ShapeContext(op->getResult(0), new_shape); } +template <> +std::optional propagateHelper( + OpBuilder& b, Operation* op, ShapeContext& inputCtx) { + auto resultShape = + op->getResult(0).getType().cast().getShape(); + SmallVector newShape(resultShape.begin(), resultShape.end()); + return ShapeContext(op->getResult(0), newShape); +} + +template <> +std::optional propagateHelper( + OpBuilder& b, Operation* op, ShapeContext& inputCtx) { + auto resultShape = + op->getResult(0).getType().cast().getShape(); + SmallVector newShape(resultShape.begin(), resultShape.end()); + return ShapeContext(op->getResult(0), newShape); +} +template <> +std::optional propagateHelper( + OpBuilder& b, Operation* op, ShapeContext& inputCtx) { + auto resultShape = + op->getResult(0).getType().cast().getShape(); + SmallVector newShape(resultShape.begin(), resultShape.end()); + return ShapeContext(op->getResult(0), newShape); +} template <> std::optional propagateHelper( @@ -345,7 +524,7 @@ std::optional propagateHelper( } if (include_this_dim && src_shape[dim_idx] == dim_size.getSExtValue()) { - new_shape.push_back(ShapedType::kDynamic); + new_shape.push_back(dim_size.getSExtValue()); } else if (include_this_dim && src_shape[dim_idx] != dim_size.getSExtValue()) { 
new_shape.push_back(dim_size.getSExtValue()); @@ -409,6 +588,7 @@ LogicalResult parseInputDynamicDims( } void applyShapeContext(ShapeContext& ctx) { + if (!ctx.value) return; auto res_ty = ctx.value.getType().dyn_cast(); if (!res_ty) return; auto elemTy = res_ty.getElementType(); @@ -420,46 +600,69 @@ std::optional propagateOpShape(OpBuilder& rewriter, Operation* op, if (isUnaryOp(op)) { return ShapeContext(op->getResult(0), inputCtx.shape); } - if (auto ctx = HandleBinaryOp(rewriter, op, inputCtx)) { - return ctx; - } - using PropagationFunc = - std::optional (*)(OpBuilder&, Operation*, ShapeContext&); - const std::vector propagationFunctions = { - propagateHelper, - propagateHelper, - propagateHelper, - propagateHelper, - propagateHelper, - propagateHelper, - propagateHelper, - }; - // Iterate over the propagation functions and apply each one - for (const auto& propagate : propagationFunctions) { - if (auto ctx = propagate(rewriter, op, inputCtx)) { - return ctx; - } + if (isBinaryOp(op)) { + return HandleBinaryOp(rewriter, op, inputCtx); } + if (isa(op)) { + return propagateHelper(rewriter, op, inputCtx); + } + if (isa(op)) { + return ShapeContext(op->getResult(0), inputCtx.shape); + } +#define PROPAGATE_OP_HANDLER(opType) \ + if (auto t##opType = dyn_cast(op)) { \ + rewriter.setInsertionPoint(op); \ + return propagateHelper(rewriter, op, inputCtx); \ + } + PROPAGATE_OP_HANDLER(DotOp); + PROPAGATE_OP_HANDLER(SliceOp); + PROPAGATE_OP_HANDLER(ReshapeOp); + PROPAGATE_OP_HANDLER(ConcatenateOp); + PROPAGATE_OP_HANDLER(ReduceOp); + PROPAGATE_OP_HANDLER(TransposeOp); + PROPAGATE_OP_HANDLER(GatherOp); + PROPAGATE_OP_HANDLER(DynamicGatherOp); + PROPAGATE_OP_HANDLER(DotGeneralOp); + PROPAGATE_OP_HANDLER(DynamicReshapeOp); + PROPAGATE_OP_HANDLER(RealDynamicSliceOp); + PROPAGATE_OP_HANDLER(DynamicBroadcastInDimOp); + // PROPAGATE_OP_HANDLER(DimOp); +#undef PROPAGATE_OP_HANDLER return std::nullopt; -} +} // namespace + +bool shouldStopPropagation(Operation* op, ShapeContext& ctx) { + if (isConcreteShape(ctx)) return true; + if (isa(op)) + return true; + if (isa(op->getParentOp())) return true; -void visitOperator(ModuleOp& m, OpBuilder& rewriter, Operation* op, - ShapeContext& ctx) { - if (isConcreteShape(ctx)) return; - // later to process return operators - if (isa(op)) return; + return false; +} +void DiscShapePropagatePass::visitOperator(ModuleOp& m, OpBuilder& rewriter, + Operation* op, + std::stack& ctxStack) { + auto ctx = ctxStack.top(); + if (shouldStopPropagation(op, ctx)) { + return; + } auto resultShapeCtx = propagateOpShape(rewriter, op, ctx); if (!resultShapeCtx) { - m.emitError("failed update shape context on op:" + + m.emitError("failed propagate shape on op:" + op->getName().stripDialect().str()); + signalPassFailure(); return; } - - for (auto user : op->getResult(0).getUsers()) { - visitOperator(m, rewriter, user, resultShapeCtx.value()); + ctxStack.push(*resultShapeCtx); + SmallVector ctxUsers(resultShapeCtx->value.getUsers().begin(), + resultShapeCtx->value.getUsers().end()); + for (size_t i = 0; i < ctxUsers.size(); ++i) { + visitOperator(m, rewriter, ctxUsers[i], ctxStack); } - applyShapeContext(*resultShapeCtx); + auto context = ctxStack.top(); + ctxStack.pop(); + applyShapeContext(context); } void DiscShapePropagatePass::runOnOperation() { @@ -495,10 +698,12 @@ void DiscShapePropagatePass::runOnOperation() { for (auto dim : pair.second) { newShape[dim] = ShapedType::kDynamic; } + std::stack ctxStack; ShapeContext ctx(value, newShape); + ctxStack.push(ctx); auto newType = 
RankedTensorType::get(newShape, ty.getElementType());
     for (auto user : main.getArgument(argIdx).getUsers()) {
-      visitOperator(m, rewriter, user, ctx);
+      visitOperator(m, rewriter, user, ctxStack);
     }
     new_arg_types[argIdx] = newType;
     applyShapeContext(ctx);
diff --git a/tao_compiler/mlir/disc/transforms/tests/disc-shape-propagate.mlir b/tao_compiler/mlir/disc/transforms/tests/disc-shape-propagate.mlir
index 57b52fffb02..82160e066f8 100755
--- a/tao_compiler/mlir/disc/transforms/tests/disc-shape-propagate.mlir
+++ b/tao_compiler/mlir/disc/transforms/tests/disc-shape-propagate.mlir
@@ -10,18 +10,8 @@ func.func @main(%arg0: tensor<4x101xi64>, %arg1: tensor<4x101xi64>) -> tensor<4x
 // -----
 // CHECK-LABEL: main
-func.func @main(%arg0: tensor<4x101xi64>) -> tensor<4x101xi1> attributes{tf.entry_function = {input_dynamic_dims = "0:1"}}{
-  // CHECK: %1 = shape.shape_of %arg0 : tensor<4x?xi64> -> tensor<2xindex>
-  // CHECK: %2 = "mhlo.dynamic_broadcast_in_dim"(%0, %1) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor<2xindex>) -> tensor<4x?xi64>
-  %0 = mhlo.constant dense<0> : tensor<4x101xi64>
-  %1 = "mhlo.compare"(%arg0, %0) {comparison_direction = #mhlo} : (tensor<4x101xi64>, tensor<4x101xi64>) -> tensor<4x101xi1>
-  return %1 : tensor<4x101xi1>
-}
-
-// -----
-// CHECK-LABEL: main
-func.func @main(%arg0: tensor<4x101x32x128xbf16>) -> tensor<4x32x101x128xbf16> attributes{tf.entry_function = {input_dynamic_dims = "0:0,1"}}{
-  // CHECK: %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>, result_layout = dense<[3, 1, 2, 0]> : tensor<4xindex>, xla_shape = "bf16[4,32,101,128]{3,1,2,0}"} : (tensor) -> tensor
+func.func @main(%arg0: tensor<4x101x32x128xbf16>) -> tensor<4x32x101x128xbf16> attributes{tf.entry_function = {input_dynamic_dims = "0:1"}}{
+  // CHECK: %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>, result_layout = dense<[3, 1, 2, 0]> : tensor<4xindex>, xla_shape = "bf16[4,32,101,128]{3,1,2,0}"} : (tensor<4x?x32x128xbf16>) -> tensor<4x32x?x128xbf16>
   %1 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>, result_layout = dense<[3, 1, 2, 0]> : tensor<4xindex>, xla_shape = "bf16[4,32,101,128]{3,1,2,0}"} : (tensor<4x101x32x128xbf16>) -> tensor<4x32x101x128xbf16>
   return %1 : tensor<4x32x101x128xbf16>
 }
@@ -127,7 +117,7 @@ func.func @main(%arg0: tensor<32001x4096xf32>, %arg1: tensor<4x101x1xi64>) -> te
   // CHECK: %dim = tensor.dim %arg0, %c1 : tensor<32001x?xf32>
   // CHECK: %0 = arith.index_cast %dim : index to i64
   // CHECK: %from_elements = tensor.from_elements %c1_i64, %0 : tensor<2xi64>
-  // CHECK: %1 = "mhlo.dynamic_gather"(%arg0, %arg1, %from_elements) {dimension_numbers = #mhlo.gather, indices_are_sorted = false} : (tensor<32001x?xf32>, tensor, tensor<2xi64>) -> tensor
+  // CHECK: %1 = "mhlo.dynamic_gather"(%arg0, %arg1, %from_elements) {dimension_numbers = #mhlo.gather, indices_are_sorted = false} : (tensor<32001x?xf32>, tensor, tensor<2xi64>) -> tensor
   %1 = "mhlo.gather"(%arg0, %arg1) {dimension_numbers = #mhlo.gather, indices_are_sorted = false, slice_sizes = dense<[1, 4096]> : tensor<2xi64>} : (tensor<32001x4096xf32>, tensor<4x101x1xi64>) -> tensor<4x101x4096xf32>
   return %1 : tensor<4x101x4096xf32>
 }
@@ -136,7 +126,7 @@ func.func @main(%arg0: tensor<32001x4096xf32>, %arg1: tensor<4x101x1xi64>) -> te
 // CHECK-LABEL: main
 func.func @main(%arg0: tensor<32001x4096xf32>, %arg1: tensor<4x101x1xi64>) -> tensor<4x101x4096xf32> attributes{tf.entry_function = {input_dynamic_dims = "1:0,1"}}{
   // CHECK: %cst = arith.constant dense<[1, 4096]> : tensor<2xi64>
-  // CHECK: %0 = "mhlo.dynamic_gather"(%arg0, %arg1, %cst) {dimension_numbers = #mhlo.gather, indices_are_sorted = false} : (tensor<32001x4096xf32>, tensor, tensor<2xi64>) -> tensor
+  // CHECK: %0 = "mhlo.dynamic_gather"(%arg0, %arg1, %cst) {dimension_numbers = #mhlo.gather, indices_are_sorted = false} : (tensor<32001x4096xf32>, tensor, tensor<2xi64>) -> tensor
   %1 = "mhlo.gather"(%arg0, %arg1) {dimension_numbers = #mhlo.gather, indices_are_sorted = false, slice_sizes = dense<[1, 4096]> : tensor<2xi64>} : (tensor<32001x4096xf32>, tensor<4x101x1xi64>) -> tensor<4x101x4096xf32>
   return %1 : tensor<4x101x4096xf32>
 }
@@ -145,7 +135,7 @@ func.func @main(%arg0: tensor<32001x4096xf32>, %arg1: tensor<4x101x1xi64>) -> te
 // CHECK-LABEL: main
 func.func @main(%arg0: tensor<32001x4096xf32>, %arg1: tensor<4x101x1xi64>) -> tensor<4x101x4096xf32> attributes{tf.entry_function = {input_dynamic_dims = "1:0"}}{
   // CHECK: %cst = arith.constant dense<[1, 4096]> : tensor<2xi64>
-  // CHECK: %0 = "mhlo.dynamic_gather"(%arg0, %arg1, %cst) {dimension_numbers = #mhlo.gather, indices_are_sorted = false} : (tensor<32001x4096xf32>, tensor, tensor<2xi64>) -> tensor
+  // CHECK: %0 = "mhlo.dynamic_gather"(%arg0, %arg1, %cst) {dimension_numbers = #mhlo.gather, indices_are_sorted = false} : (tensor<32001x4096xf32>, tensor, tensor<2xi64>) -> tensor
   %1 = "mhlo.gather"(%arg0, %arg1) {dimension_numbers = #mhlo.gather, indices_are_sorted = false, slice_sizes = dense<[1, 4096]> : tensor<2xi64>} : (tensor<32001x4096xf32>, tensor<4x101x1xi64>) -> tensor<4x101x4096xf32>
   return %1 : tensor<4x101x4096xf32>
 }
@@ -166,4 +156,66 @@ func.func @main(%arg0: tensor<32001x4096xf32>, %arg1: tensor<4x101x1xi64>) -> te
   // CHECK: %0 = "mhlo.dynamic_gather"(%arg0, %arg1, %cst) {dimension_numbers = #mhlo.gather, indices_are_sorted = false} : (tensor<32001x?xf32>, tensor, tensor<2xi64>) -> tensor
   %1 = "mhlo.gather"(%arg0, %arg1) {dimension_numbers = #mhlo.gather, indices_are_sorted = false, slice_sizes = dense<[1, 2048]> : tensor<2xi64>} : (tensor<32001x4096xf32>, tensor<4x101x1xi64>) -> tensor<4x101x2048xf32>
   return %1 : tensor<4x101x2048xf32>
-}
\ No newline at end of file
+}
+
+
+// -----
+// CHECK-LABEL: main
+func.func @main(%arg0: tensor<4x32x101x128xbf16>) -> tensor<4x32x101x64xbf16> attributes{tf.entry_function = {input_dynamic_dims = "0:2"}}{
+  // %0 = mhlo.real_dynamic_slice %arg0, %from_elements, %from_elements_7, %from_elements_6 : (tensor<4x32x?x128xbf16>, tensor<4xindex>, tensor<4xindex>, tensor<4xindex>) -> tensor<4x32x?x64xbf16>
+  %140 = "mhlo.slice"(%arg0) {limit_indices = dense<[4, 32, 101, 64]> : tensor<4xi64>, start_indices = dense<0> : tensor<4xi64>, strides = dense<1> : tensor<4xi64>} : (tensor<4x32x101x128xbf16>) -> tensor<4x32x101x64xbf16>
+  return %140 : tensor<4x32x101x64xbf16>
+}
+
+// -----
+// CHECK-LABEL: main
+func.func @main(%arg0: tensor<1x101x128xbf16>) -> tensor<101x128xbf16> attributes{tf.entry_function = {input_dynamic_dims = "0:1"}}{
+  // CHECK: %3 = mhlo.dynamic_reshape %arg0, %2 : (tensor<1x?x128xbf16>, tensor<2xindex>) -> tensor
+  %0 = mhlo.reshape %arg0: (tensor<1x101x128xbf16>) -> tensor<101x128xbf16>
+  return %0: tensor<101x128xbf16>
+}
+
+// -----
+// CHECK-LABEL: main
+func.func @main(%arg0: tensor<101x128xbf16>) -> tensor<1x101x128xbf16> attributes{tf.entry_function = {input_dynamic_dims = "0:1"}}{
+  // CHECK: %3 = mhlo.dynamic_reshape %arg0, %2 : (tensor<101x?xbf16>, tensor<3xindex>) -> tensor<1x101x?xbf16>
+  %0 = mhlo.reshape %arg0: (tensor<101x128xbf16>) -> tensor<1x101x128xbf16>
+  return %0: tensor<1x101x128xbf16>
+}
+
+// -----
+// CHECK-LABEL: main
+func.func @main(%arg0: tensor<4x101x32x128xbf16>) -> tensor<404x4096xbf16> attributes{tf.entry_function = {input_dynamic_dims = "0:1"}}{
+  // CHECK: %2 = mhlo.compute_reshape_shape %1, %cst : (index, tensor<2xindex>) -> tensor<2xindex>
+  %0 = mhlo.reshape %arg0: (tensor<4x101x32x128xbf16>) -> tensor<404x4096xbf16>
+  return %0: tensor<404x4096xbf16>
+}
+
+
+// -----
+// CHECK-LABEL: main
+func.func @main(%arg0: tensor<404x128xbf16>) -> tensor<4x101x128xbf16> attributes{tf.entry_function = {input_dynamic_dims = "0:0"}}{
+  // CHECK: %3 = mhlo.dynamic_reshape %arg0, %2 : (tensor, tensor<3xindex>) -> tensor<4x?x128xbf16>
+  %0 = mhlo.reshape %arg0: (tensor<404x128xbf16>) -> tensor<4x101x128xbf16>
+  return %0: tensor<4x101x128xbf16>
+}
+
+
+// -----
+// CHECK-LABEL: main
+func.func @main(%arg0: tensor<4x101xi64>) -> tensor<400xi1> attributes{tf.entry_function = {input_dynamic_dims = "0:1"}}{
+  // CHECK: %cst = arith.constant dense<-1> : tensor<1xindex>
+  // CHECK: %cst_0 = arith.constant dense<1> : tensor<2xindex>
+  // CHECK: %cst_1 = arith.constant dense<[0, 1]> : tensor<2xindex>
+  // CHECK: %0 = mhlo.constant dense<0> : tensor<i64>
+  // CHECK: %c4 = arith.constant 4 : index
+  // CHECK: %c1 = arith.constant 1 : index
+  // CHECK: %dim = tensor.dim %arg0, %c1 : tensor<4x?xi64>
+  // CHECK: %from_elements = tensor.from_elements %c4, %dim : tensor<2xindex>
+  // CHECK: %1 = mhlo.real_dynamic_slice %arg0, %cst_1, %from_elements, %cst_0 : (tensor<4x?xi64>, tensor<2xindex>, tensor<2xindex>, tensor<2xindex>) -> tensor<4x?xi64>
+  %44 = "mhlo.slice"(%arg0) {limit_indices = dense<[4, 101]> : tensor<2xi64>, start_indices = dense<[0, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x101xi64>) -> tensor<4x100xi64>
+  %45 = mhlo.reshape %44 : (tensor<4x100xi64>) -> tensor<400xi64>
+  %21 = mhlo.constant dense<0> : tensor<i64>
+  %22 = "mhlo.broadcast_in_dim"(%21) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<i64>) -> tensor<400xi64>
+  %23 = mhlo.compare LT, %45, %22 : (tensor<400xi64>, tensor<400xi64>) -> tensor<400xi1>
+  return %23: tensor<400xi1>
+}