Commit d333f5b: merge master

eedalong committed May 20, 2024
2 parents f1039be + b90fb75
Showing 13 changed files with 48 additions and 109 deletions.
3 changes: 2 additions & 1 deletion docker/dev/Dockerfile.aarch64
@@ -84,7 +84,8 @@ RUN apt-get -y update && \
zip \
zlib1g-dev \
openjdk-11-jdk \
patchelf
patchelf \
pkg-config

RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 1 && \
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 1 && \
58 changes: 29 additions & 29 deletions tao_compiler/mlir/disc/BUILD
@@ -1011,34 +1011,6 @@ cc_library(
alwayslink = 1,
)

cc_library(
name = "disc_argsmutation_expand",
srcs = ["transforms/disc_argsmutation_expand.cc"],
hdrs = [
"transforms/passes.h",
"transforms/rewriters.h",
],
deps = [
":lmhlo_disc",
":pass_details",
":placement_utils",
":shape_utils",
"@org_tensorflow//tensorflow/compiler/xla/mlir_hlo:lhlo",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:FuncDialect",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:MemRefDialect",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:ShapeDialect",
"@llvm-project//mlir:ShapeTransforms",
"@llvm-project//mlir:Support",
"@llvm-project//mlir:TensorDialect",
"@llvm-project//mlir:Transforms",
"@llvm-project//mlir:SCFDialect",
],
alwayslink = 1,
)

cc_library(
name = "disc_lower_to_library_call",
srcs = ["transforms/disc_lower_to_library_call.cc"],
@@ -2414,6 +2386,34 @@ cc_library(
alwayslink = 1,
)

cc_library(
name = "disc_argsmutation_expand",
srcs = ["transforms/disc_argsmutation_expand.cc"],
hdrs = [
"transforms/passes.h",
"transforms/rewriters.h",
],
deps = [
":lmhlo_disc",
":pass_details",
":placement_utils",
":shape_utils",
"@org_tensorflow//tensorflow/compiler/xla/mlir_hlo:lhlo",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:FuncDialect",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:MemRefDialect",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:ShapeDialect",
"@llvm-project//mlir:ShapeTransforms",
"@llvm-project//mlir:Support",
"@llvm-project//mlir:TensorDialect",
"@llvm-project//mlir:Transforms",
"@llvm-project//mlir:SCFDialect",
],
alwayslink = 1,
)

cc_library(
name = "all_passes",
hdrs = [
@@ -2426,8 +2426,8 @@ cc_library(
":quantized_dot_rewriter",
":disc_dot_merge",
":disc_quantized_dot_merge",
":disc_argsmutation_expand",
":disc_algebraic_simplifier",
":disc_argsmutation_expand",
":disc_assign_kernel_name",
":disc_assign_memory_space",
":disc_bf16_expansion",
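The disc_argsmutation_expand target above is moved, not changed: its srcs, hdrs, and deps are identical in both locations, target order inside a BUILD file has no semantic effect in Bazel, and all_passes still lists it in deps, so the pass stays linked in. For reference, a minimal C++ sketch of how a pass declared this way is typically exposed to tools such as disc-opt; this is illustrative only, since DISC's actual registration is generated from its .td pass definitions:

#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassRegistry.h"

namespace mlir {
namespace mhlo_disc {
// Declared in transforms/passes.h (see the change to that file below).
std::unique_ptr<OperationPass<ModuleOp>> createDiscArgsMutationExpandPass();
}  // namespace mhlo_disc
}  // namespace mlir

// Illustrative registration hook (hypothetical function name): lets a tool
// resolve -disc-argsmutation-expand on its command line.
static void registerDiscArgsMutationExpandPassForTools() {
  ::mlir::registerPass(
      [] { return ::mlir::mhlo_disc::createDiscArgsMutationExpandPass(); });
}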
19 changes: 3 additions & 16 deletions tao_compiler/mlir/disc/disc_compiler.cc
@@ -430,7 +430,6 @@ LogicalResult LowerHLOToLLVM(ModuleOp m, const DISCLoweringOptions& options) {
pm.addPass(mhlo::createLegalizeToLhloPass());
pm.addNestedPass<FuncOp>(createCanonicalizerPass());
pm.addPass(mhlo_disc::createDiscLhloRewriterPass());

pm.addNestedPass<FuncOp>(createCanonicalizerPass());

// Convert shape to std. Community ```convert-shape-to-std``` pass
@@ -542,9 +541,6 @@ LogicalResult LowerHLOToLLVM(ModuleOp m, const DISCLoweringOptions& options) {
pm.addNestedPass<FuncOp>(bufferization::createBufferDeallocationPass());
pm.addNestedPass<FuncOp>(disc_ral::createDiscBufferDeallocationPass());




pm.addPass(disc_ral::createRalInjectExecutionContextPass());
pm.addNestedPass<FuncOp>(
disc_ral::createDiscLowerToLibraryCallPass(gpu_enabled));
@@ -636,6 +632,7 @@ LogicalResult LowerHLOToLLVM(ModuleOp m, const DISCLoweringOptions& options) {

pm.addNestedPass<FuncOp>(disc_ral::createLhloFusionInlinerPass());

// Expand ArgsMutationOp to redirect the memory write target
pm.addPass(mhlo_disc::createDiscArgsMutationExpandPass());

if (gpu_enabled) {
@@ -1026,22 +1023,12 @@ Status ConvertTF2MlirHlo(mlir::ModuleOp module_op) {

// Replace const arguments with ConstOp and update the argument type if it
// is a fixed-shaped input

std::string enable_alg_simp = "";

tensorflow::ReadStringFromEnvVar("DISC_ENBALE_ALG_SIMP", "",
&enable_alg_simp);
if (enable_alg_simp.size()) {
pm.addNestedPass<mlir::func::FuncOp>(
mlir::disc_ral::createDiscAlgebraicSimplifierPass());
}

pm.addPass(mlir::disc_ral::createReviseArgsForStaticRankPass());

// Note that the region-based control-flow produced here still contains
// function call ops which get inlined by the subsequent inliner pass.
// pm.addPass(mlir::TF::CreateTFFunctionalControlFlowToRegions());
// pm.addPass(mlir::createInlinerPass());
pm.addPass(mlir::TF::CreateTFFunctionalControlFlowToRegions());
pm.addPass(mlir::createInlinerPass());
pm.addNestedPass<mlir::func::FuncOp>(
mlir::TF::CreateDropWhileShapeInvariantPass());
// Create a replicated TensorList initialization ops for all of its uses. This
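Two behavioral changes land in this file: the env-gated DiscAlgebraicSimplifierPass block is deleted from ConvertTF2MlirHlo, and the TF functional-control-flow-to-regions and inliner passes, previously commented out, now run unconditionally. For reference, a minimal sketch of the removed env-var gate, reconstructed from the deleted lines (the ENBALE spelling is preserved from the original; the wrapper function name is hypothetical):

#include <string>

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Pass/PassManager.h"
#include "tensorflow/core/util/env_var.h"
// createDiscAlgebraicSimplifierPass() is declared in transforms/passes.h.

static void maybeAddAlgebraicSimplifier(mlir::PassManager& pm) {
  std::string enable_alg_simp;
  tensorflow::ReadStringFromEnvVar("DISC_ENBALE_ALG_SIMP",
                                   /*default_val=*/"", &enable_alg_simp);
  // Only enable the simplifier when the variable is set to a non-empty value.
  if (!enable_alg_simp.empty()) {
    pm.addNestedPass<mlir::func::FuncOp>(
        mlir::disc_ral::createDiscAlgebraicSimplifierPass());
  }
}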
12 changes: 6 additions & 6 deletions tao_compiler/mlir/disc/disc_compiler_main.cc
@@ -209,12 +209,12 @@ int RealMain() {
1e6
<< " s.\n";

// llvm::dbgs() << "[[ INFO ]] Running TF2XLA\n";
// auto s = tensorflow::ConvertTF2MlirHlo(module);
// if (!s.ok()) {
// llvm::dbgs() << "ConvertTF2MlirHlo failed: " << s.ToString() << "\n";
// return 1;
//}
llvm::dbgs() << "[[ INFO ]] Running TF2XLA\n";
auto s = tensorflow::ConvertTF2MlirHlo(module);
if (!s.ok()) {
llvm::dbgs() << "ConvertTF2MlirHlo failed: " << s.ToString() << "\n";
return 1;
}

if (VLOG_IS_ON(0)) {
llvm::dbgs() << "======== BEGIN After TF2HLO =========\n";
3 changes: 1 addition & 2 deletions tao_compiler/mlir/disc/transforms/disc_argsmutation_expand.cc
100755 → 100644
@@ -67,7 +67,6 @@ struct LhloDISCArgsMutationOpConverter
PatternRewriter& rewriter) const override {
auto op = lhloOp.getOperation();
auto operands = op->getOperands();
// Value value = backtraceOperand<memref::ReinterpretCastOp>(operands[0]);
operands[0].replaceAllUsesWith(operands[1]);
rewriter.eraseOp(op);
return success();
@@ -106,4 +105,4 @@ std::unique_ptr<OperationPass<ModuleOp>> createDiscArgsMutationExpandPass() {
return std::make_unique<DiscArgsMutationExpandPass>();
}
} // namespace mhlo_disc
} // namespace mlir
} // namespace mlir
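The converter above no longer backtraces through memref.reinterpret_cast before rewriting (the commented-out backtraceOperand call is deleted here, and the equivalent LhloArgsMutationOpRewriter is dropped from disc_lhlo_rewriter.cc below): every use of the mutated buffer is forwarded to the source buffer directly. A self-contained sketch of that rewrite idiom, with a hypothetical pattern name; in this file the concrete op is lmhlo_disc::ArgsMutationOp, whose first operand is the mutated buffer and whose second is the value written into it:

#include "mlir/IR/PatternMatch.h"
#include "mlir/Support/LogicalResult.h"

template <typename OpTy>
struct ForwardMutatedBuffer : public mlir::OpRewritePattern<OpTy> {
  using mlir::OpRewritePattern<OpTy>::OpRewritePattern;

  mlir::LogicalResult matchAndRewrite(
      OpTy op, mlir::PatternRewriter& rewriter) const override {
    // Redirect all readers/writers of the mutated buffer (operand 0) to the
    // source buffer (operand 1), then drop the marker op itself.
    op->getOperand(0).replaceAllUsesWith(op->getOperand(1));
    rewriter.eraseOp(op);
    return mlir::success();
  }
};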
@@ -164,7 +164,6 @@ struct AllReduceOpConverter : public OpRewritePattern<mhlo::AllReduceOp> {
collective_done_op->setAttr(
"call_target_name",
rewriter.getStringAttr("ral_async_collective_done"));
// collective_done_op->moveBefore(original_consumer);
newOutputs.push_back(collective_done_op.getResult(0));
} else {
newOutputs.push_back(reduce_op.getResult(0));
@@ -245,7 +244,6 @@ struct AllGatherOpConverter : public OpRewritePattern<mhlo::AllGatherOp> {
collective_done_op->setAttr(
"call_target_name",
rewriter.getStringAttr("ral_async_collective_done"));
// collective_done_op->moveBefore(original_consumer);
rewriter.replaceOp(op, collective_done_op.getResult(0));
} else {
rewriter.replaceOp(op, all_gather_op.getResult(0));
@@ -332,7 +330,6 @@ struct ReduceScatterOpConverter
collective_done_op->setAttr(
"call_target_name",
rewriter.getStringAttr("ral_async_collective_done"));
// collective_done_op->moveBefore(original_consumer);
rewriter.replaceOp(op, collective_done_op.getResult(0));
} else {
rewriter.replaceOp(op, reduce_scatter_op.getResult(0));
16 changes: 0 additions & 16 deletions tao_compiler/mlir/disc/transforms/disc_input_output_alias.cc
100755 → 100644
@@ -137,37 +137,21 @@ struct DiscInputOutputAliasPass
auto outputs = returnOp.getOperands();

// Insert mhlo_disc::ArgsMutationOp
double total_reused_bytes = 0;
for (int i = 0; i < params_index.size(); i++) {
if (outputs[outputs_index[i]] == params[params_index[i]]) {
continue;
}
// DISC currently supports only one-hop buffer sharing.
auto defineOp = outputs[outputs_index[i]].getDefiningOp();

// OptimizationBarrierOp doesn't need buffer reuse.
if (llvm::isa<mhlo::OptimizationBarrierOp>(defineOp)) {
continue;
}

// for (const auto& value : defineOp->getOperands()) {
// if (params[params_index[i]] == value) {
builder.setInsertionPointAfter(defineOp);
builder.create<mhlo_disc::ArgsMutationOp>(main_func.getLoc(),
outputs[outputs_index[i]],
params[params_index[i]]);
total_reused_bytes += outputs[outputs_index[i]]
.getType()
.dyn_cast<RankedTensorType>()
.getNumElements() *
4;
// break;
// }
//}
}

llvm::dbgs() << "Total Reused Buffer Size For Module Input and Output Is "
<< total_reused_bytes / 1024 / 1024 / 1024 << " GB\n";
}
};

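The statistics deleted above multiplied getNumElements() by a hard-coded 4, i.e. they assumed f32 elements and a static shape. If similar logging is ever reinstated, a sketch that derives the width from the element type avoids both assumptions (helper name hypothetical):

#include <cstdint>

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Value.h"

// Hypothetical helper: bytes occupied by a ranked, statically shaped tensor.
// Returns 0 (instead of guessing) for dynamic shapes or exotic element types.
static int64_t staticTensorSizeInBytes(mlir::Value v) {
  auto ty = v.getType().dyn_cast<mlir::RankedTensorType>();
  if (!ty || !ty.hasStaticShape() || !ty.getElementType().isIntOrFloat())
    return 0;
  return ty.getNumElements() * ty.getElementTypeBitWidth() / 8;
}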
15 changes: 0 additions & 15 deletions tao_compiler/mlir/disc/transforms/disc_lhlo_rewriter.cc
@@ -66,21 +66,6 @@ Value backtraceOperand(Value operand) {
return operand;
}

struct LhloArgsMutationOpRewriter
: public OpRewritePattern<lmhlo_disc::ArgsMutationOp> {
explicit LhloArgsMutationOpRewriter(MLIRContext* context)
: OpRewritePattern(context) {}
LogicalResult matchAndRewrite(lmhlo_disc::ArgsMutationOp lhloOp,
PatternRewriter& rewriter) const override {
auto op = lhloOp.getOperation();
auto operands = op->getOperands();
Value value = backtraceOperand<memref::ReinterpretCastOp>(operands[0]);
value.replaceAllUsesWith(operands[1]);
rewriter.eraseOp(op);
return success();
}
};

struct LhloConcatenateOpConverter
: public OpRewritePattern<lmhlo::ConcatenateOp> {
explicit LhloConcatenateOpConverter(MLIRContext* context)
2 changes: 1 addition & 1 deletion tao_compiler/mlir/disc/transforms/mhlo_disc_passes.td
@@ -38,4 +38,4 @@ def DiscArgsMutationExpandPass : Pass<"disc-argsmutation-expand", "ModuleOp"> {
def DiscOpSchedulePass : Pass<"disc-op-schedule", "ModuleOp"> {
let summary = "Schedule ops in a function";
let constructor = "createDiscOpSchedulePass()";
}
}
4 changes: 3 additions & 1 deletion tao_compiler/mlir/disc/transforms/passes.h
@@ -349,9 +349,11 @@ std::unique_ptr<OperationPass<ModuleOp>> createDiscLhloRewriterPass();

std::unique_ptr<OperationPass<ModuleOp>>
createDiscOptimizationBarrierExpandPass();
std::unique_ptr<OperationPass<ModuleOp>> createDiscArgsMutationExpandPass();

std::unique_ptr<OperationPass<ModuleOp>> createDiscOpSchedulePass();

std::unique_ptr<OperationPass<ModuleOp>> createDiscArgsMutationExpandPass();

} // namespace mhlo_disc
} // namespace mlir

2 changes: 1 addition & 1 deletion tao_compiler/mlir/disc/transforms/tests/input-mutation.mlir
100644 → 100755
@@ -1,4 +1,4 @@
// RUN: disc-opt %s -disc-hlo-legalize-to-lhlo -hlo-legalize-to-lhlo -canonicalize -disc-lhlo-rewriter -split-input-file | FileCheck %s
// RUN: disc-opt %s -disc-hlo-legalize-to-lhlo -hlo-legalize-to-lhlo -canonicalize -disc-lhlo-rewriter -disc-argsmutation-expand -split-input-file | FileCheck %s

func.func @input_mutation(%arg0: tensor<8x32xf32>, %arg1: tensor<8x32xf32>) -> tensor<8x32xf32> {
// CHECK: "lmhlo.add"(%arg0, %arg1, %arg0) : (memref<8x32xf32>, memref<8x32xf32>, memref<8x32xf32>) -> ()
15 changes: 1 addition & 14 deletions tao_compiler/mlir/ral/context/base/cuda/cuda_context_impl.cc
@@ -171,15 +171,6 @@ std::unique_ptr<BaseContext> MakeBaseCudaContext(
new ::tao::ral::gpu::GPUDriver(ctx.get())));

ctx->getOrCreateResource(kRalBaseCudaContextState, [opt, gpu_opt]() {

// Get the device handle
CUdevice cuDevice;
cuDeviceGet(&cuDevice, gpu_opt.device_ordinal);

// Create a new context and set it as the active context for the current thread
CUcontext cuContext;
cuCtxCreate(&cuContext, 0, cuDevice);

auto state = new BaseCudaContextState;
state->stream = gpu_opt.stream;
state->nccl_comm = gpu_opt.nccl_comm;
@@ -446,11 +437,7 @@ void ral_base_cuda_launch(ExecutionContext* ctx, void** blobs, size_t num_blobs,
reportErrorIfAny(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY,
blockZ, smem, stream, params, nullptr),
ctx, "LaunchKernel");
#endif'
if(std::getenv("CUDA_LAUNCH_BLOCKING") != nullptr) {
reportErrorIfAny(cuStreamSynchronize(state->stream), ctx, "StreamSync");
}
#endif
}

stream_t ral_base_cuda_get_stream(ExecutionContext* ctx, int32_t stream_id) {
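The deleted branch (along with the stray #endif' it sat under) hand-rolled CUDA_LAUNCH_BLOCKING: it synchronized the stream after every launch so that an asynchronous kernel failure is reported at its own launch site rather than at some later synchronizing call. As a standalone debugging aid, the same idea looks roughly like this (driver-API sketch; the wrapper name is hypothetical and error handling beyond the return code is elided):

#include <cstdlib>

#include <cuda.h>

// Force each launch to complete before returning when CUDA_LAUNCH_BLOCKING
// is set, so async kernel errors surface immediately.
static CUresult LaunchMaybeBlocking(CUfunction f, CUstream stream,
                                    unsigned gx, unsigned gy, unsigned gz,
                                    unsigned bx, unsigned by, unsigned bz,
                                    unsigned smem_bytes, void** params) {
  CUresult r = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, smem_bytes, stream,
                              params, /*extra=*/nullptr);
  if (r == CUDA_SUCCESS && std::getenv("CUDA_LAUNCH_BLOCKING") != nullptr)
    r = cuStreamSynchronize(stream);  // block until the kernel finishes
  return r;
}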
5 changes: 1 addition & 4 deletions tao_compiler/mlir/ral/context/base/cuda/cuda_context_impl.h
100755 → 100644
@@ -69,16 +69,13 @@ struct BaseCudaExecutionContext
GpuStreamHandle getCommStream();

// We need to sync on the gpu stream before we fetch the first output.
bool synced = true;
bool synced = false;
// all buffer allocated by the gpu_allocator
std::unordered_map<const_buffer_t, int> device_ptr_map;

// map int64 -> cudaEvent_t
std::map<int64_t, cudaEvent_t> async_pair_tokens;

// Device context
CUcontext cuContext;

protected:
virtual void setOutputDeleter(OutputBufferWrapper& output) override;
};
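Flipping the synced default to false means a fresh execution context is treated as not yet synchronized, so the first output fetch must wait on the GPU stream and then record that the wait happened, as the comment above the flag describes. A minimal sketch of that deferred-sync pattern, with hypothetical names (runtime-API calls; error handling elided):

#include <cuda_runtime.h>

// Deferred sync: wait on the GPU stream once, right before the first output
// is read back, then remember that the wait already happened.
struct DeferredSyncState {
  cudaStream_t stream = nullptr;
  bool synced = false;  // matches the new default above

  void EnsureSyncedBeforeFirstOutput() {
    if (!synced) {
      cudaStreamSynchronize(stream);
      synced = true;
    }
  }
};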
