Commit d333f5b: merge master

eedalong committed May 20, 2024
2 parents f1039be + b90fb75
Showing 13 changed files with 48 additions and 109 deletions.
3 changes: 2 additions & 1 deletion docker/dev/Dockerfile.aarch64
@@ -84,7 +84,8 @@ RUN apt-get -y update && \
zip \
zlib1g-dev \
openjdk-11-jdk \
patchelf
patchelf \
pkg-config

RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 1 && \
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 1 && \
58 changes: 29 additions & 29 deletions tao_compiler/mlir/disc/BUILD
@@ -1011,34 +1011,6 @@ cc_library(
alwayslink = 1,
)

cc_library(
name = "disc_argsmutation_expand",
srcs = ["transforms/disc_argsmutation_expand.cc"],
hdrs = [
"transforms/passes.h",
"transforms/rewriters.h",
],
deps = [
":lmhlo_disc",
":pass_details",
":placement_utils",
":shape_utils",
"@org_tensorflow//tensorflow/compiler/xla/mlir_hlo:lhlo",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:FuncDialect",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:MemRefDialect",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:ShapeDialect",
"@llvm-project//mlir:ShapeTransforms",
"@llvm-project//mlir:Support",
"@llvm-project//mlir:TensorDialect",
"@llvm-project//mlir:Transforms",
"@llvm-project//mlir:SCFDialect",
],
alwayslink = 1,
)

cc_library(
name = "disc_lower_to_library_call",
srcs = ["transforms/disc_lower_to_library_call.cc"],
@@ -2414,6 +2386,34 @@ cc_library(
alwayslink = 1,
)

cc_library(
name = "disc_argsmutation_expand",
srcs = ["transforms/disc_argsmutation_expand.cc"],
hdrs = [
"transforms/passes.h",
"transforms/rewriters.h",
],
deps = [
":lmhlo_disc",
":pass_details",
":placement_utils",
":shape_utils",
"@org_tensorflow//tensorflow/compiler/xla/mlir_hlo:lhlo",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:FuncDialect",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:MemRefDialect",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:ShapeDialect",
"@llvm-project//mlir:ShapeTransforms",
"@llvm-project//mlir:Support",
"@llvm-project//mlir:TensorDialect",
"@llvm-project//mlir:Transforms",
"@llvm-project//mlir:SCFDialect",
],
alwayslink = 1,
)

cc_library(
name = "all_passes",
hdrs = [
@@ -2426,8 +2426,8 @@ cc_library(
":quantized_dot_rewriter",
":disc_dot_merge",
":disc_quantized_dot_merge",
":disc_argsmutation_expand",
":disc_algebraic_simplifier",
":disc_argsmutation_expand",
":disc_assign_kernel_name",
":disc_assign_memory_space",
":disc_bf16_expansion",
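The disc_argsmutation_expand target above is moved, not changed: its srcs, hdrs, and deps are identical in both locations, target order inside a BUILD file has no semantic effect in Bazel, and all_passes still lists it in deps, so the pass stays linked in. For reference, a minimal C++ sketch of how a pass declared this way is typically exposed to tools such as disc-opt; this is illustrative only, since DISC's actual registration is generated from its .td pass definitions:

#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassRegistry.h"

namespace mlir {
namespace mhlo_disc {
// Declared in transforms/passes.h (see the change to that file below).
std::unique_ptr<OperationPass<ModuleOp>> createDiscArgsMutationExpandPass();
}  // namespace mhlo_disc
}  // namespace mlir

// Illustrative registration hook (hypothetical function name): lets a tool
// resolve -disc-argsmutation-expand on its command line.
static void registerDiscArgsMutationExpandPassForTools() {
  ::mlir::registerPass(
      [] { return ::mlir::mhlo_disc::createDiscArgsMutationExpandPass(); });
}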
19 changes: 3 additions & 16 deletions tao_compiler/mlir/disc/disc_compiler.cc
@@ -430,7 +430,6 @@ LogicalResult LowerHLOToLLVM(ModuleOp m, const DISCLoweringOptions& options) {
pm.addPass(mhlo::createLegalizeToLhloPass());
pm.addNestedPass<FuncOp>(createCanonicalizerPass());
pm.addPass(mhlo_disc::createDiscLhloRewriterPass());

pm.addNestedPass<FuncOp>(createCanonicalizerPass());

// Convert shape to std. Community ```convert-shape-to-std``` pass
@@ -542,9 +541,6 @@ LogicalResult LowerHLOToLLVM(ModuleOp m, const DISCLoweringOptions& options) {
pm.addNestedPass<FuncOp>(bufferization::createBufferDeallocationPass());
pm.addNestedPass<FuncOp>(disc_ral::createDiscBufferDeallocationPass());




pm.addPass(disc_ral::createRalInjectExecutionContextPass());
pm.addNestedPass<FuncOp>(
disc_ral::createDiscLowerToLibraryCallPass(gpu_enabled));
@@ -636,6 +632,7 @@ LogicalResult LowerHLOToLLVM(ModuleOp m, const DISCLoweringOptions& options) {

pm.addNestedPass<FuncOp>(disc_ral::createLhloFusionInlinerPass());

// Expand ArgsMutationOp to redirect the memory write target
pm.addPass(mhlo_disc::createDiscArgsMutationExpandPass());

if (gpu_enabled) {
@@ -1026,22 +1023,12 @@ Status ConvertTF2MlirHlo(mlir::ModuleOp module_op) {

// Replace const arguments with ConstOp and update the argument type if it
// is a fixed-shaped input

std::string enable_alg_simp = "";

tensorflow::ReadStringFromEnvVar("DISC_ENBALE_ALG_SIMP", "",
&enable_alg_simp);
if (enable_alg_simp.size()) {
pm.addNestedPass<mlir::func::FuncOp>(
mlir::disc_ral::createDiscAlgebraicSimplifierPass());
}

pm.addPass(mlir::disc_ral::createReviseArgsForStaticRankPass());

// Note that the region-based control-flow produced here still contains
// function call ops which get inlined by the subsequent inliner pass.
// pm.addPass(mlir::TF::CreateTFFunctionalControlFlowToRegions());
// pm.addPass(mlir::createInlinerPass());
pm.addPass(mlir::TF::CreateTFFunctionalControlFlowToRegions());
pm.addPass(mlir::createInlinerPass());
pm.addNestedPass<mlir::func::FuncOp>(
mlir::TF::CreateDropWhileShapeInvariantPass());
// Create a replicated TensorList initialization ops for all of its uses. This
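Two behavioral changes land in this file: the env-gated DiscAlgebraicSimplifierPass block is deleted from ConvertTF2MlirHlo, and the TF functional-control-flow-to-regions and inliner passes, previously commented out, now run unconditionally. For reference, a minimal sketch of the removed env-var gate, reconstructed from the deleted lines (the ENBALE spelling is preserved from the original; the wrapper function name is hypothetical):

#include <string>

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Pass/PassManager.h"
#include "tensorflow/core/util/env_var.h"
// createDiscAlgebraicSimplifierPass() is declared in transforms/passes.h.

static void maybeAddAlgebraicSimplifier(mlir::PassManager& pm) {
  std::string enable_alg_simp;
  tensorflow::ReadStringFromEnvVar("DISC_ENBALE_ALG_SIMP",
                                   /*default_val=*/"", &enable_alg_simp);
  // Only enable the simplifier when the variable is set to a non-empty value.
  if (!enable_alg_simp.empty()) {
    pm.addNestedPass<mlir::func::FuncOp>(
        mlir::disc_ral::createDiscAlgebraicSimplifierPass());
  }
}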
12 changes: 6 additions & 6 deletions tao_compiler/mlir/disc/disc_compiler_main.cc
@@ -209,12 +209,12 @@ int RealMain() {
1e6
<< " s.\n";

// llvm::dbgs() << "[[ INFO ]] Running TF2XLA\n";
// auto s = tensorflow::ConvertTF2MlirHlo(module);
// if (!s.ok()) {
// llvm::dbgs() << "ConvertTF2MlirHlo failed: " << s.ToString() << "\n";
// return 1;
//}
llvm::dbgs() << "[[ INFO ]] Running TF2XLA\n";
auto s = tensorflow::ConvertTF2MlirHlo(module);
if (!s.ok()) {
llvm::dbgs() << "ConvertTF2MlirHlo failed: " << s.ToString() << "\n";
return 1;
}

if (VLOG_IS_ON(0)) {
llvm::dbgs() << "======== BEGIN After TF2HLO =========\n";
3 changes: 1 addition & 2 deletions tao_compiler/mlir/disc/transforms/disc_argsmutation_expand.cc
100755 → 100644
@@ -67,7 +67,6 @@ struct LhloDISCArgsMutationOpConverter
PatternRewriter& rewriter) const override {
auto op = lhloOp.getOperation();
auto operands = op->getOperands();
// Value value = backtraceOperand<memref::ReinterpretCastOp>(operands[0]);
operands[0].replaceAllUsesWith(operands[1]);
rewriter.eraseOp(op);
return success();
@@ -106,4 +105,4 @@ std::unique_ptr<OperationPass<ModuleOp>> createDiscArgsMutationExpandPass() {
return std::make_unique<DiscArgsMutationExpandPass>();
}
} // namespace mhlo_disc
} // namespace mlir
} // namespace mlir
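The converter above no longer backtraces through memref.reinterpret_cast before rewriting (the commented-out backtraceOperand call is deleted here, and the equivalent LhloArgsMutationOpRewriter is dropped from disc_lhlo_rewriter.cc below): every use of the mutated buffer is forwarded to the source buffer directly. A self-contained sketch of that rewrite idiom, with a hypothetical pattern name; in this file the concrete op is lmhlo_disc::ArgsMutationOp, whose first operand is the mutated buffer and whose second is the value written into it:

#include "mlir/IR/PatternMatch.h"
#include "mlir/Support/LogicalResult.h"

template <typename OpTy>
struct ForwardMutatedBuffer : public mlir::OpRewritePattern<OpTy> {
  using mlir::OpRewritePattern<OpTy>::OpRewritePattern;

  mlir::LogicalResult matchAndRewrite(
      OpTy op, mlir::PatternRewriter& rewriter) const override {
    // Redirect all readers/writers of the mutated buffer (operand 0) to the
    // source buffer (operand 1), then drop the marker op itself.
    op->getOperand(0).replaceAllUsesWith(op->getOperand(1));
    rewriter.eraseOp(op);
    return mlir::success();
  }
};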
@@ -164,7 +164,6 @@ struct AllReduceOpConverter : public OpRewritePattern<mhlo::AllReduceOp> {
collective_done_op->setAttr(
"call_target_name",
rewriter.getStringAttr("ral_async_collective_done"));
// collective_done_op->moveBefore(original_consumer);
newOutputs.push_back(collective_done_op.getResult(0));
} else {
newOutputs.push_back(reduce_op.getResult(0));
@@ -245,7 +244,6 @@ struct AllGatherOpConverter : public OpRewritePattern<mhlo::AllGatherOp> {
collective_done_op->setAttr(
"call_target_name",
rewriter.getStringAttr("ral_async_collective_done"));
// collective_done_op->moveBefore(original_consumer);
rewriter.replaceOp(op, collective_done_op.getResult(0));
} else {
rewriter.replaceOp(op, all_gather_op.getResult(0));
@@ -332,7 +330,6 @@ struct ReduceScatterOpConverter
collective_done_op->setAttr(
"call_target_name",
rewriter.getStringAttr("ral_async_collective_done"));
// collective_done_op->moveBefore(original_consumer);
rewriter.replaceOp(op, collective_done_op.getResult(0));
} else {
rewriter.replaceOp(op, reduce_scatter_op.getResult(0));
16 changes: 0 additions & 16 deletions tao_compiler/mlir/disc/transforms/disc_input_output_alias.cc
100755 → 100644
@@ -137,37 +137,21 @@ struct DiscInputOutputAliasPass
auto outputs = returnOp.getOperands();

// Insert mhlo_disc::ArgsMutationOp
double total_reused_bytes = 0;
for (int i = 0; i < params_index.size(); i++) {
if (outputs[outputs_index[i]] == params[params_index[i]]) {
continue;
}
// DISC currently supports only one-hop buffer sharing.
auto defineOp = outputs[outputs_index[i]].getDefiningOp();

// OptimizationBarrierOp doesn't need buffer reuse.
if (llvm::isa<mhlo::OptimizationBarrierOp>(defineOp)) {
continue;
}

// for (const auto& value : defineOp->getOperands()) {
// if (params[params_index[i]] == value) {
builder.setInsertionPointAfter(defineOp);
builder.create<mhlo_disc::ArgsMutationOp>(main_func.getLoc(),
outputs[outputs_index[i]],
params[params_index[i]]);
total_reused_bytes += outputs[outputs_index[i]]
.getType()
.dyn_cast<RankedTensorType>()
.getNumElements() *
4;
// break;
// }
//}
}

llvm::dbgs() << "Total Reused Buffer Size For Module Input and Output Is "
<< total_reused_bytes / 1024 / 1024 / 1024 << " GB\n";
}
};

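The statistics deleted above multiplied getNumElements() by a hard-coded 4, i.e. they assumed f32 elements and a static shape. If similar logging is ever reinstated, a sketch that derives the width from the element type avoids both assumptions (helper name hypothetical):

#include <cstdint>

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Value.h"

// Hypothetical helper: bytes occupied by a ranked, statically shaped tensor.
// Returns 0 (instead of guessing) for dynamic shapes or exotic element types.
static int64_t staticTensorSizeInBytes(mlir::Value v) {
  auto ty = v.getType().dyn_cast<mlir::RankedTensorType>();
  if (!ty || !ty.hasStaticShape() || !ty.getElementType().isIntOrFloat())
    return 0;
  return ty.getNumElements() * ty.getElementTypeBitWidth() / 8;
}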
15 changes: 0 additions & 15 deletions tao_compiler/mlir/disc/transforms/disc_lhlo_rewriter.cc
@@ -66,21 +66,6 @@ Value backtraceOperand(Value operand) {
return operand;
}

struct LhloArgsMutationOpRewriter
: public OpRewritePattern<lmhlo_disc::ArgsMutationOp> {
explicit LhloArgsMutationOpRewriter(MLIRContext* context)
: OpRewritePattern(context) {}
LogicalResult matchAndRewrite(lmhlo_disc::ArgsMutationOp lhloOp,
PatternRewriter& rewriter) const override {
auto op = lhloOp.getOperation();
auto operands = op->getOperands();
Value value = backtraceOperand<memref::ReinterpretCastOp>(operands[0]);
value.replaceAllUsesWith(operands[1]);
rewriter.eraseOp(op);
return success();
}
};

struct LhloConcatenateOpConverter
: public OpRewritePattern<lmhlo::ConcatenateOp> {
explicit LhloConcatenateOpConverter(MLIRContext* context)
2 changes: 1 addition & 1 deletion tao_compiler/mlir/disc/transforms/mhlo_disc_passes.td
@@ -38,4 +38,4 @@ def DiscArgsMutationExpandPass : Pass<"disc-argsmutation-expand", "ModuleOp"> {
def DiscOpSchedulePass : Pass<"disc-op-schedule", "ModuleOp"> {
let summary = "Schedule ops in a function";
let constructor = "createDiscOpSchedulePass()";
}
}
4 changes: 3 additions & 1 deletion tao_compiler/mlir/disc/transforms/passes.h
@@ -349,9 +349,11 @@ std::unique_ptr<OperationPass<ModuleOp>> createDiscLhloRewriterPass();

std::unique_ptr<OperationPass<ModuleOp>>
createDiscOptimizationBarrierExpandPass();
std::unique_ptr<OperationPass<ModuleOp>> createDiscArgsMutationExpandPass();

std::unique_ptr<OperationPass<ModuleOp>> createDiscOpSchedulePass();

std::unique_ptr<OperationPass<ModuleOp>> createDiscArgsMutationExpandPass();

} // namespace mhlo_disc
} // namespace mlir

2 changes: 1 addition & 1 deletion tao_compiler/mlir/disc/transforms/tests/input-mutation.mlir
100644 → 100755
@@ -1,4 +1,4 @@
// RUN: disc-opt %s -disc-hlo-legalize-to-lhlo -hlo-legalize-to-lhlo -canonicalize -disc-lhlo-rewriter -split-input-file | FileCheck %s
// RUN: disc-opt %s -disc-hlo-legalize-to-lhlo -hlo-legalize-to-lhlo -canonicalize -disc-lhlo-rewriter -disc-argsmutation-expand -split-input-file | FileCheck %s

func.func @input_mutation(%arg0: tensor<8x32xf32>, %arg1: tensor<8x32xf32>) -> tensor<8x32xf32> {
// CHECK: "lmhlo.add"(%arg0, %arg1, %arg0) : (memref<8x32xf32>, memref<8x32xf32>, memref<8x32xf32>) -> ()
15 changes: 1 addition & 14 deletions tao_compiler/mlir/ral/context/base/cuda/cuda_context_impl.cc
@@ -171,15 +171,6 @@ std::unique_ptr<BaseContext> MakeBaseCudaContext(
new ::tao::ral::gpu::GPUDriver(ctx.get())));

ctx->getOrCreateResource(kRalBaseCudaContextState, [opt, gpu_opt]() {

// Get the device handle
CUdevice cuDevice;
cuDeviceGet(&cuDevice, gpu_opt.device_ordinal);

// Create a new context and set it as the active context for the current thread
CUcontext cuContext;
cuCtxCreate(&cuContext, 0, cuDevice);

auto state = new BaseCudaContextState;
state->stream = gpu_opt.stream;
state->nccl_comm = gpu_opt.nccl_comm;
@@ -446,11 +437,7 @@ void ral_base_cuda_launch(ExecutionContext* ctx, void** blobs, size_t num_blobs,
reportErrorIfAny(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY,
blockZ, smem, stream, params, nullptr),
ctx, "LaunchKernel");
#endif'
if(std::getenv("CUDA_LAUNCH_BLOCKING") != nullptr) {
reportErrorIfAny(cuStreamSynchronize(state->stream), ctx, "StreamSync");
}
#endif
}

stream_t ral_base_cuda_get_stream(ExecutionContext* ctx, int32_t stream_id) {
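The deleted branch (along with the stray #endif' it sat under) hand-rolled CUDA_LAUNCH_BLOCKING: it synchronized the stream after every launch so that an asynchronous kernel failure is reported at its own launch site rather than at some later synchronizing call. As a standalone debugging aid, the same idea looks roughly like this (driver-API sketch; the wrapper name is hypothetical and error handling beyond the return code is elided):

#include <cstdlib>

#include <cuda.h>

// Force each launch to complete before returning when CUDA_LAUNCH_BLOCKING
// is set, so async kernel errors surface immediately.
static CUresult LaunchMaybeBlocking(CUfunction f, CUstream stream,
                                    unsigned gx, unsigned gy, unsigned gz,
                                    unsigned bx, unsigned by, unsigned bz,
                                    unsigned smem_bytes, void** params) {
  CUresult r = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, smem_bytes, stream,
                              params, /*extra=*/nullptr);
  if (r == CUDA_SUCCESS && std::getenv("CUDA_LAUNCH_BLOCKING") != nullptr)
    r = cuStreamSynchronize(stream);  // block until the kernel finishes
  return r;
}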
5 changes: 1 addition & 4 deletions tao_compiler/mlir/ral/context/base/cuda/cuda_context_impl.h
100755 → 100644
@@ -69,16 +69,13 @@ struct BaseCudaExecutionContext
GpuStreamHandle getCommStream();

// We need to sync on the gpu stream before we fetch the first output.
bool synced = true;
bool synced = false;
// all buffer allocated by the gpu_allocator
std::unordered_map<const_buffer_t, int> device_ptr_map;

// map int64 -> cudaEvent_t
std::map<int64_t, cudaEvent_t> async_pair_tokens;

// Device context
CUcontext cuContext;

protected:
virtual void setOutputDeleter(OutputBufferWrapper& output) override;
};
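Flipping the synced default to false means a fresh execution context is treated as not yet synchronized, so the first output fetch must wait on the GPU stream and then record that the wait happened, as the comment above the flag describes. A minimal sketch of that deferred-sync pattern, with hypothetical names (runtime-API calls; error handling elided):

#include <cuda_runtime.h>

// Deferred sync: wait on the GPU stream once, right before the first output
// is read back, then remember that the wait already happened.
struct DeferredSyncState {
  cudaStream_t stream = nullptr;
  bool synced = false;  // matches the new default above

  void EnsureSyncedBeforeFirstOutput() {
    if (!synced) {
      cudaStreamSynchronize(stream);
      synced = true;
    }
  }
};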
